summary | refs | log | tree | commit | diff
path: root/net/core
diff options
context:
space:
mode:
Diffstat (limited to 'net/core')
-rw-r--r-- net/core/Makefile | 6
-rw-r--r-- net/core/datagram.c | 24
-rw-r--r-- net/core/dev.c | 523
-rw-r--r-- net/core/dev_addr_lists.c | 24
-rw-r--r-- net/core/dst.c | 15
-rw-r--r-- net/core/ethtool.c | 732
-rw-r--r-- net/core/fib_rules.c | 5
-rw-r--r-- net/core/filter.c | 4
-rw-r--r-- net/core/flow.c | 26
-rw-r--r-- net/core/flow_dissector.c | 143
-rw-r--r-- net/core/kmap_skb.h | 2
-rw-r--r-- net/core/link_watch.c | 9
-rw-r--r-- net/core/neighbour.c | 277
-rw-r--r-- net/core/net-sysfs.c | 339
-rw-r--r-- net/core/net-traces.c | 1
-rw-r--r-- net/core/net_namespace.c | 1
-rw-r--r-- net/core/netevent.c | 1
-rw-r--r-- net/core/netpoll.c | 13
-rw-r--r-- net/core/netprio_cgroup.c | 344
-rw-r--r-- net/core/pktgen.c | 46
-rw-r--r-- net/core/request_sock.c | 7
-rw-r--r-- net/core/rtnetlink.c | 59
-rw-r--r-- net/core/scm.c | 10
-rw-r--r-- net/core/secure_seq.c | 10
-rw-r--r-- net/core/skbuff.c | 264
-rw-r--r-- net/core/sock.c | 236
-rw-r--r-- net/core/sock_diag.c | 192
-rw-r--r-- net/core/sysctl_net_core.c | 9
-rw-r--r-- net/core/timestamping.c | 13
-rw-r--r-- net/core/user_dma.c | 7
30 files changed, 2139 insertions, 1203 deletions
diff --git a/net/core/Makefile b/net/core/Makefile
index 0d357b1c4e5..674641b13ae 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -3,12 +3,13 @@
#
obj-y := sock.o request_sock.o skbuff.o iovec.o datagram.o stream.o scm.o \
- gen_stats.o gen_estimator.o net_namespace.o secure_seq.o
+ gen_stats.o gen_estimator.o net_namespace.o secure_seq.o flow_dissector.o
obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
- neighbour.o rtnetlink.o utils.o link_watch.o filter.o
+ neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
+ sock_diag.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
@@ -19,3 +20,4 @@ obj-$(CONFIG_FIB_RULES) += fib_rules.o
obj-$(CONFIG_TRACEPOINTS) += net-traces.o
obj-$(CONFIG_NET_DROP_MONITOR) += drop_monitor.o
obj-$(CONFIG_NETWORK_PHY_TIMESTAMPING) += timestamping.o
+obj-$(CONFIG_NETPRIO_CGROUP) += netprio_cgroup.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 18ac112ea7a..68bbf9f65cb 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -324,15 +324,15 @@ int skb_copy_datagram_iovec(const struct sk_buff *skb, int offset,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -410,15 +410,15 @@ int skb_copy_datagram_const_iovec(const struct sk_buff *skb, int offset,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -500,15 +500,15 @@ int skb_copy_datagram_from_iovec(struct sk_buff *skb, int offset,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
int err;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
@@ -585,16 +585,16 @@ static int skb_copy_and_csum_datagram(const struct sk_buff *skb, int offset,
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
__wsum csum2;
int err = 0;
u8 *vaddr;
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;
diff --git a/net/core/dev.c b/net/core/dev.c
index b10ff0a7185..f494675471a 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -133,6 +133,9 @@
#include <linux/pci.h>
#include <linux/inetdevice.h>
#include <linux/cpu_rmap.h>
+#include <linux/net_tstamp.h>
+#include <linux/jump_label.h>
+#include <net/flow_keys.h>
#include "net-sysfs.h"
@@ -1316,8 +1319,6 @@ EXPORT_SYMBOL(dev_close);
*/
void dev_disable_lro(struct net_device *dev)
{
- u32 flags;
-
/*
* If we're trying to disable lro on a vlan device
* use the underlying physical device instead
@@ -1325,15 +1326,9 @@ void dev_disable_lro(struct net_device *dev)
if (is_vlan_dev(dev))
dev = vlan_dev_real_dev(dev);
- if (dev->ethtool_ops && dev->ethtool_ops->get_flags)
- flags = dev->ethtool_ops->get_flags(dev);
- else
- flags = ethtool_op_get_flags(dev);
-
- if (!(flags & ETH_FLAG_LRO))
- return;
+ dev->wanted_features &= ~NETIF_F_LRO;
+ netdev_update_features(dev);
- __ethtool_set_flags(dev, flags & ~ETH_FLAG_LRO);
if (unlikely(dev->features & NETIF_F_LRO))
netdev_WARN(dev, "failed to disable LRO!\n");
}
@@ -1392,7 +1387,7 @@ rollback:
for_each_net(net) {
for_each_netdev(net, dev) {
if (dev == last)
- break;
+ goto outroll;
if (dev->flags & IFF_UP) {
nb->notifier_call(nb, NETDEV_GOING_DOWN, dev);
@@ -1403,6 +1398,7 @@ rollback:
}
}
+outroll:
raw_notifier_chain_unregister(&netdev_chain, nb);
goto unlock;
}
@@ -1445,33 +1441,105 @@ int call_netdevice_notifiers(unsigned long val, struct net_device *dev)
}
EXPORT_SYMBOL(call_netdevice_notifiers);
-/* When > 0 there are consumers of rx skb time stamps */
-static atomic_t netstamp_needed = ATOMIC_INIT(0);
+static struct jump_label_key netstamp_needed __read_mostly;
+#ifdef HAVE_JUMP_LABEL
+/* We are not allowed to call jump_label_dec() from irq context
+ * If net_disable_timestamp() is called from irq context, defer the
+ * jump_label_dec() calls.
+ */
+static atomic_t netstamp_needed_deferred;
+#endif
void net_enable_timestamp(void)
{
- atomic_inc(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ int deferred = atomic_xchg(&netstamp_needed_deferred, 0);
+
+ if (deferred) {
+ while (--deferred)
+ jump_label_dec(&netstamp_needed);
+ return;
+ }
+#endif
+ WARN_ON(in_interrupt());
+ jump_label_inc(&netstamp_needed);
}
EXPORT_SYMBOL(net_enable_timestamp);
void net_disable_timestamp(void)
{
- atomic_dec(&netstamp_needed);
+#ifdef HAVE_JUMP_LABEL
+ if (in_interrupt()) {
+ atomic_inc(&netstamp_needed_deferred);
+ return;
+ }
+#endif
+ jump_label_dec(&netstamp_needed);
}
EXPORT_SYMBOL(net_disable_timestamp);
static inline void net_timestamp_set(struct sk_buff *skb)
{
- if (atomic_read(&netstamp_needed))
+ skb->tstamp.tv64 = 0;
+ if (static_branch(&netstamp_needed))
__net_timestamp(skb);
- else
- skb->tstamp.tv64 = 0;
}
-static inline void net_timestamp_check(struct sk_buff *skb)
+/*
+ * Call __net_timestamp(SKB) when the netstamp_needed key is enabled, COND
+ * is true and SKB->tstamp.tv64 is still zero.
+ *
+ * The body is wrapped in do { } while (0) so that the macro expands to
+ * exactly one statement: it needs its trailing ';' at the call site and
+ * cannot swallow an "else" that belongs to an enclosing "if".
+ */
+#define net_timestamp_check(COND, SKB)				\
+	do {							\
+		if (static_branch(&netstamp_needed)) {		\
+			if ((COND) && !(SKB)->tstamp.tv64)	\
+				__net_timestamp(SKB);		\
+		}						\
+	} while (0)
+
+/*
+ * Validate the struct hwtstamp_config that user space passed through
+ * ifr->ifr_data.
+ *
+ * Returns 0 when the request is acceptable, -EFAULT when the structure
+ * cannot be copied from user space, -EINVAL when any flag bit is set and
+ * -ERANGE when tx_type or rx_filter is not one of the values listed below.
+ */
+static int net_hwtstamp_validate(struct ifreq *ifr)
+{
+	struct hwtstamp_config cfg;
+	enum hwtstamp_tx_types tx_type;
+	enum hwtstamp_rx_filters rx_filter;
+	int tx_ok = 0;
+	int rx_ok = 0;
+
+	if (copy_from_user(&cfg, ifr->ifr_data, sizeof(cfg)))
+		return -EFAULT;
+
+	/* reserved for future extensions */
+	if (cfg.flags != 0)
+		return -EINVAL;
+
+	/*
+	 * Both switches run on enum-typed copies and have no "default"
+	 * label; only the enumerators named here mark the value as valid.
+	 */
+	tx_type = cfg.tx_type;
+	switch (tx_type) {
+	case HWTSTAMP_TX_OFF:
+	case HWTSTAMP_TX_ON:
+	case HWTSTAMP_TX_ONESTEP_SYNC:
+		tx_ok = 1;
+		break;
+	}
+
+	rx_filter = cfg.rx_filter;
+	switch (rx_filter) {
+	case HWTSTAMP_FILTER_NONE:
+	case HWTSTAMP_FILTER_ALL:
+	case HWTSTAMP_FILTER_SOME:
+	case HWTSTAMP_FILTER_PTP_V1_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V1_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V1_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L4_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L4_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_L2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_L2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ:
+	case HWTSTAMP_FILTER_PTP_V2_EVENT:
+	case HWTSTAMP_FILTER_PTP_V2_SYNC:
+	case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ:
+		rx_ok = 1;
+		break;
+	}
+
+	if (tx_ok && rx_ok)
+		return 0;
+
+	return -ERANGE;
 }
static inline bool is_skb_forwardable(struct net_device *dev,
@@ -1868,7 +1936,8 @@ EXPORT_SYMBOL(skb_checksum_help);
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
*/
-struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
+struct sk_buff *skb_gso_segment(struct sk_buff *skb,
+ netdev_features_t features)
{
struct sk_buff *segs = ERR_PTR(-EPROTONOSUPPORT);
struct packet_type *ptype;
@@ -1898,9 +1967,9 @@ struct sk_buff *skb_gso_segment(struct sk_buff *skb, u32 features)
if (dev && dev->ethtool_ops && dev->ethtool_ops->get_drvinfo)
dev->ethtool_ops->get_drvinfo(dev, &info);
- WARN(1, "%s: caps=(0x%lx, 0x%lx) len=%d data_len=%d ip_summed=%d\n",
- info.driver, dev ? dev->features : 0L,
- skb->sk ? skb->sk->sk_route_caps : 0L,
+ WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d ip_summed=%d\n",
+ info.driver, dev ? &dev->features : NULL,
+ skb->sk ? &skb->sk->sk_route_caps : NULL,
skb->len, skb->data_len, skb->ip_summed);
if (skb_header_cloned(skb) &&
@@ -1955,9 +2024,11 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
#ifdef CONFIG_HIGHMEM
int i;
if (!(dev->features & NETIF_F_HIGHDMA)) {
- for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- if (PageHighMem(skb_shinfo(skb)->frags[i].page))
+ for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ if (PageHighMem(skb_frag_page(frag)))
return 1;
+ }
}
if (PCI_DMA_BUS_IS_PHYS) {
@@ -1966,7 +2037,8 @@ static int illegal_highdma(struct net_device *dev, struct sk_buff *skb)
if (!pdev)
return 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- dma_addr_t addr = page_to_phys(skb_shinfo(skb)->frags[i].page);
+ skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
+ dma_addr_t addr = page_to_phys(skb_frag_page(frag));
if (!pdev->dma_mask || addr + PAGE_SIZE - 1 > *pdev->dma_mask)
return 1;
}
@@ -2006,7 +2078,7 @@ static void dev_gso_skb_destructor(struct sk_buff *skb)
* This function segments the given skb and stores the list of segments
* in skb->next.
*/
-static int dev_gso_segment(struct sk_buff *skb, int features)
+static int dev_gso_segment(struct sk_buff *skb, netdev_features_t features)
{
struct sk_buff *segs;
@@ -2045,7 +2117,7 @@ static inline void skb_orphan_try(struct sk_buff *skb)
}
}
-static bool can_checksum_protocol(unsigned long features, __be16 protocol)
+static bool can_checksum_protocol(netdev_features_t features, __be16 protocol)
{
return ((features & NETIF_F_GEN_CSUM) ||
((features & NETIF_F_V4_CSUM) &&
@@ -2056,7 +2128,8 @@ static bool can_checksum_protocol(unsigned long features, __be16 protocol)
protocol == htons(ETH_P_FCOE)));
}
-static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features)
+static netdev_features_t harmonize_features(struct sk_buff *skb,
+ __be16 protocol, netdev_features_t features)
{
if (!can_checksum_protocol(features, protocol)) {
features &= ~NETIF_F_ALL_CSUM;
@@ -2068,10 +2141,10 @@ static u32 harmonize_features(struct sk_buff *skb, __be16 protocol, u32 features
return features;
}
-u32 netif_skb_features(struct sk_buff *skb)
+netdev_features_t netif_skb_features(struct sk_buff *skb)
{
__be16 protocol = skb->protocol;
- u32 features = skb->dev->features;
+ netdev_features_t features = skb->dev->features;
if (protocol == htons(ETH_P_8021Q)) {
struct vlan_ethhdr *veh = (struct vlan_ethhdr *)skb->data;
@@ -2117,7 +2190,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
unsigned int skb_len;
if (likely(!skb->next)) {
- u32 features;
+ netdev_features_t features;
/*
* If device doesn't need skb->dst, release it right now while
@@ -2198,7 +2271,7 @@ gso:
return rc;
}
txq_trans_update(txq);
- if (unlikely(netif_tx_queue_stopped(txq) && skb->next))
+ if (unlikely(netif_xmit_stopped(txq) && skb->next))
return NETDEV_TX_BUSY;
} while (skb->next);
@@ -2398,6 +2471,18 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
return rc;
}
+#if IS_ENABLED(CONFIG_NETPRIO_CGROUP)
+/*
+ * Give an skb that has no priority yet the value that its socket's
+ * sk_cgrp_prioidx selects in the transmitting device's priomap.
+ *
+ * Nothing happens when the skb already has a priority, has no socket, or
+ * the device has no priomap.  The map is read with rcu_dereference_bh().
+ */
+static void skb_update_prio(struct sk_buff *skb)
+{
+	struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
+
+	if (!skb->priority && skb->sk && map) {
+		unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+
+		/*
+		 * The index must be checked against the map's length before
+		 * it is used: an index past the end would read memory beyond
+		 * the array.  In that case the priority is left untouched.
+		 * NOTE(review): priomap_len is the element count declared in
+		 * struct netprio_map, which this diff does not show - confirm.
+		 */
+		if (prioidx < map->priomap_len)
+			skb->priority = map->priomap[prioidx];
+	}
+}
+#else
+#define skb_update_prio(skb)
+#endif
+
static DEFINE_PER_CPU(int, xmit_recursion);
#define RECURSION_LIMIT 10
@@ -2438,6 +2523,8 @@ int dev_queue_xmit(struct sk_buff *skb)
*/
rcu_read_lock_bh();
+ skb_update_prio(skb);
+
txq = dev_pick_tx(dev, skb);
q = rcu_dereference_bh(txq->qdisc);
@@ -2472,7 +2559,7 @@ int dev_queue_xmit(struct sk_buff *skb)
HARD_TX_LOCK(dev, txq, cpu);
- if (!netif_tx_queue_stopped(txq)) {
+ if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq);
__this_cpu_dec(xmit_recursion);
@@ -2527,72 +2614,35 @@ static inline void ____napi_schedule(struct softnet_data *sd,
/*
* __skb_get_rxhash: calculate a flow hash based on src/dst addresses
- * and src/dst port numbers. Returns a non-zero hash number on success
- * and 0 on failure.
+ * and src/dst port numbers. Sets rxhash in skb to non-zero hash value
+ * on success, zero indicates no valid hash. Also, sets l4_rxhash in skb
+ * if hash is a canonical 4-tuple hash over transport ports.
*/
-__u32 __skb_get_rxhash(struct sk_buff *skb)
-{
- int nhoff, hash = 0, poff;
- const struct ipv6hdr *ip6;
- const struct iphdr *ip;
- u8 ip_proto;
- u32 addr1, addr2, ihl;
- union {
- u32 v32;
- u16 v16[2];
- } ports;
-
- nhoff = skb_network_offset(skb);
-
- switch (skb->protocol) {
- case __constant_htons(ETH_P_IP):
- if (!pskb_may_pull(skb, sizeof(*ip) + nhoff))
- goto done;
+void __skb_get_rxhash(struct sk_buff *skb)
+{
+ struct flow_keys keys;
+ u32 hash;
- ip = (const struct iphdr *) (skb->data + nhoff);
- if (ip_is_fragment(ip))
- ip_proto = 0;
- else
- ip_proto = ip->protocol;
- addr1 = (__force u32) ip->saddr;
- addr2 = (__force u32) ip->daddr;
- ihl = ip->ihl;
- break;
- case __constant_htons(ETH_P_IPV6):
- if (!pskb_may_pull(skb, sizeof(*ip6) + nhoff))
- goto done;
+ if (!skb_flow_dissect(skb, &keys))
+ return;
- ip6 = (const struct ipv6hdr *) (skb->data + nhoff);
- ip_proto = ip6->nexthdr;
- addr1 = (__force u32) ip6->saddr.s6_addr32[3];
- addr2 = (__force u32) ip6->daddr.s6_addr32[3];
- ihl = (40 >> 2);
- break;
- default:
- goto done;
- }
-
- ports.v32 = 0;
- poff = proto_ports_offset(ip_proto);
- if (poff >= 0) {
- nhoff += ihl * 4 + poff;
- if (pskb_may_pull(skb, nhoff + 4)) {
- ports.v32 = * (__force u32 *) (skb->data + nhoff);
- if (ports.v16[1] < ports.v16[0])
- swap(ports.v16[0], ports.v16[1]);
- }
+ if (keys.ports) {
+ if ((__force u16)keys.port16[1] < (__force u16)keys.port16[0])
+ swap(keys.port16[0], keys.port16[1]);
+ skb->l4_rxhash = 1;
}
/* get a consistent hash (same value on both flow directions) */
- if (addr2 < addr1)
- swap(addr1, addr2);
+ if ((__force u32)keys.dst < (__force u32)keys.src)
+ swap(keys.dst, keys.src);
- hash = jhash_3words(addr1, addr2, ports.v32, hashrnd);
+ hash = jhash_3words((__force u32)keys.dst,
+ (__force u32)keys.src,
+ (__force u32)keys.ports, hashrnd);
if (!hash)
hash = 1;
-done:
- return hash;
+ skb->rxhash = hash;
}
EXPORT_SYMBOL(__skb_get_rxhash);
@@ -2602,14 +2652,13 @@ EXPORT_SYMBOL(__skb_get_rxhash);
struct rps_sock_flow_table __rcu *rps_sock_flow_table __read_mostly;
EXPORT_SYMBOL(rps_sock_flow_table);
+struct jump_label_key rps_needed __read_mostly;
+
static struct rps_dev_flow *
set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
struct rps_dev_flow *rflow, u16 next_cpu)
{
- u16 tcpu;
-
- tcpu = rflow->cpu = next_cpu;
- if (tcpu != RPS_NO_CPU) {
+ if (next_cpu != RPS_NO_CPU) {
#ifdef CONFIG_RFS_ACCEL
struct netdev_rx_queue *rxqueue;
struct rps_dev_flow_table *flow_table;
@@ -2637,16 +2686,16 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
goto out;
old_rflow = rflow;
rflow = &flow_table->flows[flow_id];
- rflow->cpu = next_cpu;
rflow->filter = rc;
if (old_rflow->filter == rflow->filter)
old_rflow->filter = RPS_NO_FILTER;
out:
#endif
rflow->last_qtail =
- per_cpu(softnet_data, tcpu).input_queue_head;
+ per_cpu(softnet_data, next_cpu).input_queue_head;
}
+ rflow->cpu = next_cpu;
return rflow;
}
@@ -2681,13 +2730,13 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
map = rcu_dereference(rxqueue->rps_map);
if (map) {
if (map->len == 1 &&
- !rcu_dereference_raw(rxqueue->rps_flow_table)) {
+ !rcu_access_pointer(rxqueue->rps_flow_table)) {
tcpu = map->cpus[0];
if (cpu_online(tcpu))
cpu = tcpu;
goto done;
}
- } else if (!rcu_dereference_raw(rxqueue->rps_flow_table)) {
+ } else if (!rcu_access_pointer(rxqueue->rps_flow_table)) {
goto done;
}
@@ -2884,12 +2933,11 @@ int netif_rx(struct sk_buff *skb)
if (netpoll_rx(skb))
return NET_RX_DROP;
- if (netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
trace_netif_rx(skb);
#ifdef CONFIG_RPS
- {
+ if (static_branch(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu;
@@ -2904,14 +2952,13 @@ int netif_rx(struct sk_buff *skb)
rcu_read_unlock();
preempt_enable();
- }
-#else
+ } else
+#endif
{
unsigned int qtail;
ret = enqueue_to_backlog(skb, get_cpu(), &qtail);
put_cpu();
}
-#endif
return ret;
}
EXPORT_SYMBOL(netif_rx);
@@ -3102,8 +3149,8 @@ void netdev_rx_handler_unregister(struct net_device *dev)
{
ASSERT_RTNL();
- rcu_assign_pointer(dev->rx_handler, NULL);
- rcu_assign_pointer(dev->rx_handler_data, NULL);
+ RCU_INIT_POINTER(dev->rx_handler, NULL);
+ RCU_INIT_POINTER(dev->rx_handler_data, NULL);
}
EXPORT_SYMBOL_GPL(netdev_rx_handler_unregister);
@@ -3117,8 +3164,7 @@ static int __netif_receive_skb(struct sk_buff *skb)
int ret = NET_RX_DROP;
__be16 type;
- if (!netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(!netdev_tstamp_prequeue, skb);
trace_netif_receive_skb(skb);
@@ -3171,6 +3217,17 @@ ncls:
#endif
rx_handler = rcu_dereference(skb->dev->rx_handler);
+ if (vlan_tx_tag_present(skb)) {
+ if (pt_prev) {
+ ret = deliver_skb(skb, pt_prev, orig_dev);
+ pt_prev = NULL;
+ }
+ if (vlan_do_receive(&skb, !rx_handler))
+ goto another_round;
+ else if (unlikely(!skb))
+ goto out;
+ }
+
if (rx_handler) {
if (pt_prev) {
ret = deliver_skb(skb, pt_prev, orig_dev);
@@ -3190,18 +3247,6 @@ ncls:
}
}
- if (vlan_tx_tag_present(skb)) {
- if (pt_prev) {
- ret = deliver_skb(skb, pt_prev, orig_dev);
- pt_prev = NULL;
- }
- if (vlan_do_receive(&skb)) {
- ret = __netif_receive_skb(skb);
- goto out;
- } else if (unlikely(!skb))
- goto out;
- }
-
/* deliver only exact match when indicated */
null_or_dev = deliver_exact ? skb->dev : NULL;
@@ -3250,14 +3295,13 @@ out:
*/
int netif_receive_skb(struct sk_buff *skb)
{
- if (netdev_tstamp_prequeue)
- net_timestamp_check(skb);
+ net_timestamp_check(netdev_tstamp_prequeue, skb);
if (skb_defer_rx_timestamp(skb))
return NET_RX_SUCCESS;
#ifdef CONFIG_RPS
- {
+ if (static_branch(&rps_needed)) {
struct rps_dev_flow voidflow, *rflow = &voidflow;
int cpu, ret;
@@ -3268,16 +3312,12 @@ int netif_receive_skb(struct sk_buff *skb)
if (cpu >= 0) {
ret = enqueue_to_backlog(skb, cpu, &rflow->last_qtail);
rcu_read_unlock();
- } else {
- rcu_read_unlock();
- ret = __netif_receive_skb(skb);
+ return ret;
}
-
- return ret;
+ rcu_read_unlock();
}
-#else
- return __netif_receive_skb(skb);
#endif
+ return __netif_receive_skb(skb);
}
EXPORT_SYMBOL(netif_receive_skb);
@@ -3429,10 +3469,10 @@ pull:
skb->data_len -= grow;
skb_shinfo(skb)->frags[0].page_offset += grow;
- skb_shinfo(skb)->frags[0].size -= grow;
+ skb_frag_size_sub(&skb_shinfo(skb)->frags[0], grow);
- if (unlikely(!skb_shinfo(skb)->frags[0].size)) {
- put_page(skb_shinfo(skb)->frags[0].page);
+ if (unlikely(!skb_frag_size(&skb_shinfo(skb)->frags[0]))) {
+ skb_frag_unref(skb, 0);
memmove(skb_shinfo(skb)->frags,
skb_shinfo(skb)->frags + 1,
--skb_shinfo(skb)->nr_frags * sizeof(skb_frag_t));
@@ -3496,11 +3536,10 @@ void skb_gro_reset_offset(struct sk_buff *skb)
NAPI_GRO_CB(skb)->frag0_len = 0;
if (skb->mac_header == skb->tail &&
- !PageHighMem(skb_shinfo(skb)->frags[0].page)) {
+ !PageHighMem(skb_frag_page(&skb_shinfo(skb)->frags[0]))) {
NAPI_GRO_CB(skb)->frag0 =
- page_address(skb_shinfo(skb)->frags[0].page) +
- skb_shinfo(skb)->frags[0].page_offset;
- NAPI_GRO_CB(skb)->frag0_len = skb_shinfo(skb)->frags[0].size;
+ skb_frag_address(&skb_shinfo(skb)->frags[0]);
+ NAPI_GRO_CB(skb)->frag0_len = skb_frag_size(&skb_shinfo(skb)->frags[0]);
}
}
EXPORT_SYMBOL(skb_gro_reset_offset);
@@ -3982,6 +4021,60 @@ static int dev_ifconf(struct net *net, char __user *arg)
}
#ifdef CONFIG_PROC_FS
+
+#define BUCKET_SPACE (32 - NETDEV_HASHBITS) /* low bits of pos that hold the in-bucket offset */
+
+struct dev_iter_state {
+ struct seq_net_private p;
+ unsigned int pos; /* (bucket << BUCKET_SPACE) | offset */
+};
+
+#define get_bucket(x) ((x) >> BUCKET_SPACE) /* bucket part of an encoded pos */
+#define get_offset(x) ((x) & ((1 << BUCKET_SPACE) - 1)) /* offset part of an encoded pos */
+#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o)) /* encode bucket b and offset o */
+
+/*
+ * Look up the device that sits at the offset encoded in state->pos inside
+ * the dev_name_head bucket encoded in state->pos.
+ *
+ * On a hit, pos is advanced to the following offset in the same bucket and
+ * the device is returned.  NULL is returned, with pos unchanged, when the
+ * bucket holds no entry at that offset.
+ */
+static inline struct net_device *dev_from_same_bucket(struct seq_file *seq)
+{
+	struct dev_iter_state *iter = seq->private;
+	unsigned int bucket = get_bucket(iter->pos);
+	unsigned int wanted = get_offset(iter->pos);
+	struct hlist_head *head = &seq_file_net(seq)->dev_name_head[bucket];
+	struct hlist_node *node;
+	struct net_device *dev;
+	unsigned int idx = 0;
+
+	hlist_for_each_entry_rcu(dev, node, head, name_hlist) {
+		if (idx++ != wanted)
+			continue;
+
+		/* idx is already wanted + 1: that is where the next call resumes */
+		iter->pos = set_bucket_offset(bucket, idx);
+		return dev;
+	}
+
+	return NULL;
+}
+
+/*
+ * Return the next device starting from the position in state->pos, moving
+ * on to later buckets as needed.
+ *
+ * Each time the current bucket yields nothing, pos is rewritten to offset 0
+ * of the following bucket.  NULL is returned once the bucket number has
+ * reached NETDEV_HASHENTRIES.
+ */
+static inline struct net_device *dev_from_new_bucket(struct seq_file *seq)
+{
+	struct dev_iter_state *iter = seq->private;
+	unsigned int bucket = get_bucket(iter->pos);
+
+	for (;;) {
+		struct net_device *dev = dev_from_same_bucket(seq);
+
+		if (dev)
+			return dev;
+
+		/* pos is updated before the bound test, also on the last round */
+		bucket++;
+		iter->pos = set_bucket_offset(bucket, 0);
+		if (bucket >= NETDEV_HASHENTRIES)
+			return NULL;
+	}
+}
+
/*
* This is invoked by the /proc filesystem handler to display a device
* in detail.
@@ -3989,33 +4082,33 @@ static int dev_ifconf(struct net *net, char __user *arg)
void *dev_seq_start(struct seq_file *seq, loff_t *pos)
__acquires(RCU)
{
- struct net *net = seq_file_net(seq);
- loff_t off;
- struct net_device *dev;
+ struct dev_iter_state *state = seq->private;
rcu_read_lock();
if (!*pos)
return SEQ_START_TOKEN;
- off = 1;
- for_each_netdev_rcu(net, dev)
- if (off++ == *pos)
- return dev;
+ /* check for end of the hash */
+ if (state->pos == 0 && *pos > 1)
+ return NULL;
- return NULL;
+ return dev_from_new_bucket(seq);
}
void *dev_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- struct net_device *dev = v;
+ struct net_device *dev;
+
+ ++*pos;
if (v == SEQ_START_TOKEN)
- dev = first_net_device_rcu(seq_file_net(seq));
- else
- dev = next_net_device_rcu(dev);
+ return dev_from_new_bucket(seq);
- ++*pos;
- return dev;
+ dev = dev_from_same_bucket(seq);
+ if (dev)
+ return dev;
+
+ return dev_from_new_bucket(seq);
}
void dev_seq_stop(struct seq_file *seq, void *v)
@@ -4114,7 +4207,13 @@ static const struct seq_operations dev_seq_ops = {
static int dev_seq_open(struct inode *inode, struct file *file)
{
return seq_open_net(inode, file, &dev_seq_ops,
- sizeof(struct seq_net_private));
+ sizeof(struct dev_iter_state));
+}
+
+int dev_seq_open_ops(struct inode *inode, struct file *file,
+ const struct seq_operations *ops)
+{
+ return seq_open_net(inode, file, ops, sizeof(struct dev_iter_state));
}
static const struct file_operations dev_seq_fops = {
@@ -4367,7 +4466,7 @@ static void dev_change_rx_flags(struct net_device *dev, int flags)
static int __dev_set_promiscuity(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
uid_t uid;
gid_t gid;
@@ -4424,7 +4523,7 @@ static int __dev_set_promiscuity(struct net_device *dev, int inc)
*/
int dev_set_promiscuity(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
int err;
err = __dev_set_promiscuity(dev, inc);
@@ -4451,7 +4550,7 @@ EXPORT_SYMBOL(dev_set_promiscuity);
int dev_set_allmulti(struct net_device *dev, int inc)
{
- unsigned short old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
ASSERT_RTNL();
@@ -4497,9 +4596,7 @@ void __dev_set_rx_mode(struct net_device *dev)
if (!netif_device_present(dev))
return;
- if (ops->ndo_set_rx_mode)
- ops->ndo_set_rx_mode(dev);
- else {
+ if (!(dev->priv_flags & IFF_UNICAST_FLT)) {
/* Unicast addresses changes may only happen under the rtnl,
* therefore calling __dev_set_promiscuity here is safe.
*/
@@ -4510,10 +4607,10 @@ void __dev_set_rx_mode(struct net_device *dev)
__dev_set_promiscuity(dev, -1);
dev->uc_promisc = false;
}
-
- if (ops->ndo_set_multicast_list)
- ops->ndo_set_multicast_list(dev);
}
+
+ if (ops->ndo_set_rx_mode)
+ ops->ndo_set_rx_mode(dev);
}
void dev_set_rx_mode(struct net_device *dev)
@@ -4524,30 +4621,6 @@ void dev_set_rx_mode(struct net_device *dev)
}
/**
- * dev_ethtool_get_settings - call device's ethtool_ops::get_settings()
- * @dev: device
- * @cmd: memory area for ethtool_ops::get_settings() result
- *
- * The cmd arg is initialized properly (cleared and
- * ethtool_cmd::cmd field set to ETHTOOL_GSET).
- *
- * Return device's ethtool_ops::get_settings() result value or
- * -EOPNOTSUPP when device doesn't expose
- * ethtool_ops::get_settings() operation.
- */
-int dev_ethtool_get_settings(struct net_device *dev,
- struct ethtool_cmd *cmd)
-{
- if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
- return -EOPNOTSUPP;
-
- memset(cmd, 0, sizeof(struct ethtool_cmd));
- cmd->cmd = ETHTOOL_GSET;
- return dev->ethtool_ops->get_settings(dev, cmd);
-}
-EXPORT_SYMBOL(dev_ethtool_get_settings);
-
-/**
* dev_get_flags - get flags reported to userspace
* @dev: device
*
@@ -4580,7 +4653,7 @@ EXPORT_SYMBOL(dev_get_flags);
int __dev_change_flags(struct net_device *dev, unsigned int flags)
{
- int old_flags = dev->flags;
+ unsigned int old_flags = dev->flags;
int ret;
ASSERT_RTNL();
@@ -4663,10 +4736,10 @@ void __dev_notify_flags(struct net_device *dev, unsigned int old_flags)
* Change settings on device based state flags. The flags are
* in the userspace exported format.
*/
-int dev_change_flags(struct net_device *dev, unsigned flags)
+int dev_change_flags(struct net_device *dev, unsigned int flags)
{
- int ret, changes;
- int old_flags = dev->flags;
+ int ret;
+ unsigned int changes, old_flags = dev->flags;
ret = __dev_change_flags(dev, flags);
if (ret < 0)
@@ -4863,7 +4936,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return -EOPNOTSUPP;
case SIOCADDMULTI:
- if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
+ if (!ops->ndo_set_rx_mode ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
@@ -4871,7 +4944,7 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
return dev_mc_add_global(dev, ifr->ifr_hwaddr.sa_data);
case SIOCDELMULTI:
- if ((!ops->ndo_set_multicast_list && !ops->ndo_set_rx_mode) ||
+ if (!ops->ndo_set_rx_mode ||
ifr->ifr_hwaddr.sa_family != AF_UNSPEC)
return -EINVAL;
if (!netif_device_present(dev))
@@ -4888,6 +4961,12 @@ static int dev_ifsioc(struct net *net, struct ifreq *ifr, unsigned int cmd)
ifr->ifr_newname[IFNAMSIZ-1] = '\0';
return dev_change_name(dev, ifr->ifr_newname);
+ case SIOCSHWTSTAMP:
+ err = net_hwtstamp_validate(ifr);
+ if (err)
+ return err;
+ /* fall through */
+
/*
* Unknown or private ioctl
*/
@@ -5202,7 +5281,7 @@ static void rollback_registered_many(struct list_head *head)
dev = list_first_entry(head, struct net_device, unreg_list);
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
- rcu_barrier();
+ synchronize_net();
list_for_each_entry(dev, head, unreg_list)
dev_put(dev);
@@ -5217,7 +5296,8 @@ static void rollback_registered(struct net_device *dev)
list_del(&single);
}
-static u32 netdev_fix_features(struct net_device *dev, u32 features)
+static netdev_features_t netdev_fix_features(struct net_device *dev,
+ netdev_features_t features)
{
/* Fix illegal checksum combinations */
if ((features & NETIF_F_HW_CSUM) &&
@@ -5226,12 +5306,6 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)
features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM);
}
- if ((features & NETIF_F_NO_CSUM) &&
- (features & (NETIF_F_HW_CSUM|NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
- netdev_warn(dev, "mixed no checksumming and other settings.\n");
- features &= ~(NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM|NETIF_F_HW_CSUM);
- }
-
/* Fix illegal SG+CSUM combinations. */
if ((features & NETIF_F_SG) &&
!(features & NETIF_F_ALL_CSUM)) {
@@ -5279,7 +5353,7 @@ static u32 netdev_fix_features(struct net_device *dev, u32 features)
int __netdev_update_features(struct net_device *dev)
{
- u32 features;
+ netdev_features_t features;
int err = 0;
ASSERT_RTNL();
@@ -5295,16 +5369,16 @@ int __netdev_update_features(struct net_device *dev)
if (dev->features == features)
return 0;
- netdev_dbg(dev, "Features changed: 0x%08x -> 0x%08x\n",
- dev->features, features);
+ netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
+ &dev->features, &features);
if (dev->netdev_ops->ndo_set_features)
err = dev->netdev_ops->ndo_set_features(dev, features);
if (unlikely(err < 0)) {
netdev_err(dev,
- "set_features() failed (%d); wanted 0x%08x, left 0x%08x\n",
- err, features, dev->features);
+ "set_features() failed (%d); wanted %pNF, left %pNF\n",
+ err, &features, &dev->features);
return -1;
}
@@ -5403,6 +5477,9 @@ static void netdev_init_one_queue(struct net_device *dev,
queue->xmit_lock_owner = -1;
netdev_queue_numa_node_write(queue, NUMA_NO_NODE);
queue->dev = dev;
+#ifdef CONFIG_BQL
+ dql_init(&queue->dql, HZ);
+#endif
}
static int netif_alloc_netdev_queues(struct net_device *dev)
@@ -5488,11 +5565,12 @@ int register_netdevice(struct net_device *dev)
dev->wanted_features = dev->features & dev->hw_features;
/* Turn on no cache copy if HW is doing checksum */
- dev->hw_features |= NETIF_F_NOCACHE_COPY;
- if ((dev->features & NETIF_F_ALL_CSUM) &&
- !(dev->features & NETIF_F_NO_CSUM)) {
- dev->wanted_features |= NETIF_F_NOCACHE_COPY;
- dev->features |= NETIF_F_NOCACHE_COPY;
+ if (!(dev->flags & IFF_LOOPBACK)) {
+ dev->hw_features |= NETIF_F_NOCACHE_COPY;
+ if (dev->features & NETIF_F_ALL_CSUM) {
+ dev->wanted_features |= NETIF_F_NOCACHE_COPY;
+ dev->features |= NETIF_F_NOCACHE_COPY;
+ }
}
/* Make NETIF_F_HIGHDMA inheritable to VLAN devices.
@@ -5715,6 +5793,12 @@ void netdev_run_todo(void)
__rtnl_unlock();
+ /* Wait for rcu callbacks to finish before attempting to drain
+ * the device list. This usually avoids a 250ms wait.
+ */
+ if (!list_empty(&list))
+ rcu_barrier();
+
while (!list_empty(&list)) {
struct net_device *dev
= list_first_entry(&list, struct net_device, todo_list);
@@ -5735,8 +5819,8 @@ void netdev_run_todo(void)
/* paranoia */
BUG_ON(netdev_refcnt_read(dev));
- WARN_ON(rcu_dereference_raw(dev->ip_ptr));
- WARN_ON(rcu_dereference_raw(dev->ip6_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip_ptr));
+ WARN_ON(rcu_access_pointer(dev->ip6_ptr));
WARN_ON(dev->dn_ptr);
if (dev->destructor)
@@ -5940,7 +6024,7 @@ void free_netdev(struct net_device *dev)
kfree(dev->_rx);
#endif
- kfree(rcu_dereference_raw(dev->ingress_queue));
+ kfree(rcu_dereference_protected(dev->ingress_queue, 1));
/* Flush device addresses */
dev_addr_flush(dev);
@@ -6115,6 +6199,7 @@ int dev_change_net_namespace(struct net_device *dev, struct net *net, const char
*/
call_netdevice_notifiers(NETDEV_UNREGISTER, dev);
call_netdevice_notifiers(NETDEV_UNREGISTER_BATCH, dev);
+ rtmsg_ifinfo(RTM_DELLINK, dev, ~0U);
/*
* Flush the unicast and multicast chains
@@ -6221,7 +6306,8 @@ static int dev_cpu_callback(struct notifier_block *nfb,
* @one to the master device with current feature set @all. Will not
* enable anything that is off in @mask. Returns the new feature set.
*/
-u32 netdev_increment_features(u32 all, u32 one, u32 mask)
+netdev_features_t netdev_increment_features(netdev_features_t all,
+ netdev_features_t one, netdev_features_t mask)
{
if (mask & NETIF_F_GEN_CSUM)
mask |= NETIF_F_ALL_CSUM;
@@ -6230,10 +6316,6 @@ u32 netdev_increment_features(u32 all, u32 one, u32 mask)
all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
all &= one | ~NETIF_F_ALL_FOR_ALL;
- /* If device needs checksumming, downgrade to it. */
- if (all & (NETIF_F_ALL_CSUM & ~NETIF_F_NO_CSUM))
- all &= ~NETIF_F_NO_CSUM;
-
/* If one device supports hw checksumming, set for all. */
if (all & NETIF_F_GEN_CSUM)
all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
@@ -6298,7 +6380,7 @@ const char *netdev_drivername(const struct net_device *dev)
return empty;
}
-static int __netdev_printk(const char *level, const struct net_device *dev,
+int __netdev_printk(const char *level, const struct net_device *dev,
struct va_format *vaf)
{
int r;
@@ -6313,6 +6395,7 @@ static int __netdev_printk(const char *level, const struct net_device *dev,
return r;
}
+EXPORT_SYMBOL(__netdev_printk);
int netdev_printk(const char *level, const struct net_device *dev,
const char *format, ...)
diff --git a/net/core/dev_addr_lists.c b/net/core/dev_addr_lists.c
index e2e66939ed0..29c07fef922 100644
--- a/net/core/dev_addr_lists.c
+++ b/net/core/dev_addr_lists.c
@@ -13,6 +13,7 @@
#include <linux/netdevice.h>
#include <linux/rtnetlink.h>
+#include <linux/export.h>
#include <linux/list.h>
#include <linux/proc_fs.h>
@@ -426,7 +427,7 @@ EXPORT_SYMBOL(dev_uc_del);
*
* Add newly added addresses to the destination device and release
* addresses that have no users left. The source device must be
- * locked by netif_tx_lock_bh.
+ * locked by netif_addr_lock_bh.
*
* This function is intended to be called from the dev->set_rx_mode
* function of layered software devices.
@@ -438,11 +439,11 @@ int dev_uc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_bh(to);
+ netif_addr_lock_nested(to);
err = __hw_addr_sync(&to->uc, &from->uc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
- netif_addr_unlock_bh(to);
+ netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_uc_sync);
@@ -462,7 +463,7 @@ void dev_uc_unsync(struct net_device *to, struct net_device *from)
return;
netif_addr_lock_bh(from);
- netif_addr_lock(to);
+ netif_addr_lock_nested(to);
__hw_addr_unsync(&to->uc, &from->uc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
@@ -589,10 +590,10 @@ EXPORT_SYMBOL(dev_mc_del_global);
*
* Add newly added addresses to the destination device and release
* addresses that have no users left. The source device must be
- * locked by netif_tx_lock_bh.
+ * locked by netif_addr_lock_bh.
*
- * This function is intended to be called from the dev->set_multicast_list
- * or dev->set_rx_mode function of layered software devices.
+ * This function is intended to be called from the ndo_set_rx_mode
+ * function of layered software devices.
*/
int dev_mc_sync(struct net_device *to, struct net_device *from)
{
@@ -601,11 +602,11 @@ int dev_mc_sync(struct net_device *to, struct net_device *from)
if (to->addr_len != from->addr_len)
return -EINVAL;
- netif_addr_lock_bh(to);
+ netif_addr_lock_nested(to);
err = __hw_addr_sync(&to->mc, &from->mc, to->addr_len);
if (!err)
__dev_set_rx_mode(to);
- netif_addr_unlock_bh(to);
+ netif_addr_unlock(to);
return err;
}
EXPORT_SYMBOL(dev_mc_sync);
@@ -625,7 +626,7 @@ void dev_mc_unsync(struct net_device *to, struct net_device *from)
return;
netif_addr_lock_bh(from);
- netif_addr_lock(to);
+ netif_addr_lock_nested(to);
__hw_addr_unsync(&to->mc, &from->mc, to->addr_len);
__dev_set_rx_mode(to);
netif_addr_unlock(to);
@@ -695,8 +696,7 @@ static const struct seq_operations dev_mc_seq_ops = {
static int dev_mc_seq_open(struct inode *inode, struct file *file)
{
- return seq_open_net(inode, file, &dev_mc_seq_ops,
- sizeof(struct seq_net_private));
+ return dev_seq_open_ops(inode, file, &dev_mc_seq_ops);
}
static const struct file_operations dev_mc_seq_fops = {
diff --git a/net/core/dst.c b/net/core/dst.c
index 14b33baf073..43d94cedbf7 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -171,7 +171,7 @@ void *dst_alloc(struct dst_ops *ops, struct net_device *dev,
dst_init_metrics(dst, dst_default_metrics, true);
dst->expires = 0UL;
dst->path = dst;
- dst->_neighbour = NULL;
+ RCU_INIT_POINTER(dst->_neighbour, NULL);
#ifdef CONFIG_XFRM
dst->xfrm = NULL;
#endif
@@ -229,11 +229,11 @@ struct dst_entry *dst_destroy(struct dst_entry * dst)
smp_rmb();
again:
- neigh = dst->_neighbour;
+ neigh = rcu_dereference_protected(dst->_neighbour, 1);
child = dst->child;
if (neigh) {
- dst->_neighbour = NULL;
+ RCU_INIT_POINTER(dst->_neighbour, NULL);
neigh_release(neigh);
}
@@ -360,14 +360,19 @@ static void dst_ifdown(struct dst_entry *dst, struct net_device *dev,
if (!unregister) {
dst->input = dst->output = dst_discard;
} else {
+ struct neighbour *neigh;
+
dst->dev = dev_net(dst->dev)->loopback_dev;
dev_hold(dst->dev);
dev_put(dev);
- if (dst->_neighbour && dst->_neighbour->dev == dev) {
- dst->_neighbour->dev = dst->dev;
+ rcu_read_lock();
+ neigh = dst_get_neighbour_noref(dst);
+ if (neigh && neigh->dev == dev) {
+ neigh->dev = dst->dev;
dev_hold(dst->dev);
dev_put(dev);
}
+ rcu_read_unlock();
}
}
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 6cdba5fc2be..921aa2b4b41 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -36,235 +36,44 @@ u32 ethtool_op_get_link(struct net_device *dev)
}
EXPORT_SYMBOL(ethtool_op_get_link);
-u32 ethtool_op_get_tx_csum(struct net_device *dev)
-{
- return (dev->features & NETIF_F_ALL_CSUM) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_tx_csum);
-
-int ethtool_op_set_tx_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_IP_CSUM;
- else
- dev->features &= ~NETIF_F_IP_CSUM;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_csum);
-
-int ethtool_op_set_tx_hw_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_HW_CSUM;
- else
- dev->features &= ~NETIF_F_HW_CSUM;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_hw_csum);
-
-int ethtool_op_set_tx_ipv6_csum(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM;
- else
- dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM);
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tx_ipv6_csum);
-
-u32 ethtool_op_get_sg(struct net_device *dev)
-{
- return (dev->features & NETIF_F_SG) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_sg);
-
-int ethtool_op_set_sg(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_SG;
- else
- dev->features &= ~NETIF_F_SG;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_sg);
-
-u32 ethtool_op_get_tso(struct net_device *dev)
-{
- return (dev->features & NETIF_F_TSO) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_tso);
-
-int ethtool_op_set_tso(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_TSO;
- else
- dev->features &= ~NETIF_F_TSO;
-
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_tso);
-
-u32 ethtool_op_get_ufo(struct net_device *dev)
-{
- return (dev->features & NETIF_F_UFO) != 0;
-}
-EXPORT_SYMBOL(ethtool_op_get_ufo);
-
-int ethtool_op_set_ufo(struct net_device *dev, u32 data)
-{
- if (data)
- dev->features |= NETIF_F_UFO;
- else
- dev->features &= ~NETIF_F_UFO;
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_ufo);
-
-/* the following list of flags are the same as their associated
- * NETIF_F_xxx values in include/linux/netdevice.h
- */
-static const u32 flags_dup_features =
- (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | ETH_FLAG_NTUPLE |
- ETH_FLAG_RXHASH);
-
-u32 ethtool_op_get_flags(struct net_device *dev)
-{
- /* in the future, this function will probably contain additional
- * handling for flags which are not so easily handled
- * by a simple masking operation
- */
-
- return dev->features & flags_dup_features;
-}
-EXPORT_SYMBOL(ethtool_op_get_flags);
-
-/* Check if device can enable (or disable) particular feature coded in "data"
- * argument. Flags "supported" describe features that can be toggled by device.
- * If feature can not be toggled, it state (enabled or disabled) must match
- * hardcoded device features state, otherwise flags are marked as invalid.
- */
-bool ethtool_invalid_flags(struct net_device *dev, u32 data, u32 supported)
-{
- u32 features = dev->features & flags_dup_features;
- /* "data" can contain only flags_dup_features bits,
- * see __ethtool_set_flags */
-
- return (features & ~supported) != (data & ~supported);
-}
-EXPORT_SYMBOL(ethtool_invalid_flags);
-
-int ethtool_op_set_flags(struct net_device *dev, u32 data, u32 supported)
-{
- if (ethtool_invalid_flags(dev, data, supported))
- return -EINVAL;
-
- dev->features = ((dev->features & ~flags_dup_features) |
- (data & flags_dup_features));
- return 0;
-}
-EXPORT_SYMBOL(ethtool_op_set_flags);
-
/* Handlers for each ethtool command */
-#define ETHTOOL_DEV_FEATURE_WORDS 1
-
-static void ethtool_get_features_compat(struct net_device *dev,
- struct ethtool_get_features_block *features)
-{
- if (!dev->ethtool_ops)
- return;
-
- /* getting RX checksum */
- if (dev->ethtool_ops->get_rx_csum)
- if (dev->ethtool_ops->get_rx_csum(dev))
- features[0].active |= NETIF_F_RXCSUM;
-
- /* mark legacy-changeable features */
- if (dev->ethtool_ops->set_sg)
- features[0].available |= NETIF_F_SG;
- if (dev->ethtool_ops->set_tx_csum)
- features[0].available |= NETIF_F_ALL_CSUM;
- if (dev->ethtool_ops->set_tso)
- features[0].available |= NETIF_F_ALL_TSO;
- if (dev->ethtool_ops->set_rx_csum)
- features[0].available |= NETIF_F_RXCSUM;
- if (dev->ethtool_ops->set_flags)
- features[0].available |= flags_dup_features;
-}
-
-static int ethtool_set_feature_compat(struct net_device *dev,
- int (*legacy_set)(struct net_device *, u32),
- struct ethtool_set_features_block *features, u32 mask)
-{
- u32 do_set;
-
- if (!legacy_set)
- return 0;
-
- if (!(features[0].valid & mask))
- return 0;
-
- features[0].valid &= ~mask;
-
- do_set = !!(features[0].requested & mask);
-
- if (legacy_set(dev, do_set) < 0)
- netdev_info(dev,
- "Legacy feature change (%s) failed for 0x%08x\n",
- do_set ? "set" : "clear", mask);
-
- return 1;
-}
-
-static int ethtool_set_flags_compat(struct net_device *dev,
- int (*legacy_set)(struct net_device *, u32),
- struct ethtool_set_features_block *features, u32 mask)
-{
- u32 value;
-
- if (!legacy_set)
- return 0;
-
- if (!(features[0].valid & mask))
- return 0;
-
- value = dev->features & ~features[0].valid;
- value |= features[0].requested;
-
- features[0].valid &= ~mask;
-
- if (legacy_set(dev, value & mask) < 0)
- netdev_info(dev, "Legacy flags change failed\n");
-
- return 1;
-}
-
-static int ethtool_set_features_compat(struct net_device *dev,
- struct ethtool_set_features_block *features)
-{
- int compat;
-
- if (!dev->ethtool_ops)
- return 0;
-
- compat = ethtool_set_feature_compat(dev, dev->ethtool_ops->set_sg,
- features, NETIF_F_SG);
- compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tx_csum,
- features, NETIF_F_ALL_CSUM);
- compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_tso,
- features, NETIF_F_ALL_TSO);
- compat |= ethtool_set_feature_compat(dev, dev->ethtool_ops->set_rx_csum,
- features, NETIF_F_RXCSUM);
- compat |= ethtool_set_flags_compat(dev, dev->ethtool_ops->set_flags,
- features, flags_dup_features);
-
- return compat;
-}
+#define ETHTOOL_DEV_FEATURE_WORDS ((NETDEV_FEATURE_COUNT + 31) / 32)
+
+static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN] = {
+ [NETIF_F_SG_BIT] = "tx-scatter-gather",
+ [NETIF_F_IP_CSUM_BIT] = "tx-checksum-ipv4",
+ [NETIF_F_HW_CSUM_BIT] = "tx-checksum-ip-generic",
+ [NETIF_F_IPV6_CSUM_BIT] = "tx-checksum-ipv6",
+ [NETIF_F_HIGHDMA_BIT] = "highdma",
+ [NETIF_F_FRAGLIST_BIT] = "tx-scatter-gather-fraglist",
+ [NETIF_F_HW_VLAN_TX_BIT] = "tx-vlan-hw-insert",
+
+ [NETIF_F_HW_VLAN_RX_BIT] = "rx-vlan-hw-parse",
+ [NETIF_F_HW_VLAN_FILTER_BIT] = "rx-vlan-filter",
+ [NETIF_F_VLAN_CHALLENGED_BIT] = "vlan-challenged",
+ [NETIF_F_GSO_BIT] = "tx-generic-segmentation",
+ [NETIF_F_LLTX_BIT] = "tx-lockless",
+ [NETIF_F_NETNS_LOCAL_BIT] = "netns-local",
+ [NETIF_F_GRO_BIT] = "rx-gro",
+ [NETIF_F_LRO_BIT] = "rx-lro",
+
+ [NETIF_F_TSO_BIT] = "tx-tcp-segmentation",
+ [NETIF_F_UFO_BIT] = "tx-udp-fragmentation",
+ [NETIF_F_GSO_ROBUST_BIT] = "tx-gso-robust",
+ [NETIF_F_TSO_ECN_BIT] = "tx-tcp-ecn-segmentation",
+ [NETIF_F_TSO6_BIT] = "tx-tcp6-segmentation",
+ [NETIF_F_FSO_BIT] = "tx-fcoe-segmentation",
+
+ [NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
+ [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
+ [NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
+ [NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
+ [NETIF_F_RXHASH_BIT] = "rx-hashing",
+ [NETIF_F_RXCSUM_BIT] = "rx-checksum",
+ [NETIF_F_NOCACHE_COPY_BIT] = "tx-nocache-copy",
+ [NETIF_F_LOOPBACK_BIT] = "loopback",
+};
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
{
@@ -272,18 +81,21 @@ static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
.cmd = ETHTOOL_GFEATURES,
.size = ETHTOOL_DEV_FEATURE_WORDS,
};
- struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS] = {
- {
- .available = dev->hw_features,
- .requested = dev->wanted_features,
- .active = dev->features,
- .never_changed = NETIF_F_NEVER_CHANGE,
- },
- };
+ struct ethtool_get_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
u32 __user *sizeaddr;
u32 copy_size;
+ int i;
+
+ /* in case feature bits run out again */
+ BUILD_BUG_ON(ETHTOOL_DEV_FEATURE_WORDS * sizeof(u32) > sizeof(netdev_features_t));
- ethtool_get_features_compat(dev, features);
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ features[i].available = (u32)(dev->hw_features >> (32 * i));
+ features[i].requested = (u32)(dev->wanted_features >> (32 * i));
+ features[i].active = (u32)(dev->features >> (32 * i));
+ features[i].never_changed =
+ (u32)(NETIF_F_NEVER_CHANGE >> (32 * i));
+ }
sizeaddr = useraddr + offsetof(struct ethtool_gfeatures, size);
if (get_user(copy_size, sizeaddr))
@@ -305,7 +117,8 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
{
struct ethtool_sfeatures cmd;
struct ethtool_set_features_block features[ETHTOOL_DEV_FEATURE_WORDS];
- int ret = 0;
+ netdev_features_t wanted = 0, valid = 0;
+ int i, ret = 0;
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
@@ -317,65 +130,29 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
if (copy_from_user(features, useraddr, sizeof(features)))
return -EFAULT;
- if (features[0].valid & ~NETIF_F_ETHTOOL_BITS)
- return -EINVAL;
+ for (i = 0; i < ETHTOOL_DEV_FEATURE_WORDS; ++i) {
+ valid |= (netdev_features_t)features[i].valid << (32 * i);
+ wanted |= (netdev_features_t)features[i].requested << (32 * i);
+ }
- if (ethtool_set_features_compat(dev, features))
- ret |= ETHTOOL_F_COMPAT;
+ if (valid & ~NETIF_F_ETHTOOL_BITS)
+ return -EINVAL;
- if (features[0].valid & ~dev->hw_features) {
- features[0].valid &= dev->hw_features;
+ if (valid & ~dev->hw_features) {
+ valid &= dev->hw_features;
ret |= ETHTOOL_F_UNSUPPORTED;
}
- dev->wanted_features &= ~features[0].valid;
- dev->wanted_features |= features[0].valid & features[0].requested;
+ dev->wanted_features &= ~valid;
+ dev->wanted_features |= wanted & valid;
__netdev_update_features(dev);
- if ((dev->wanted_features ^ dev->features) & features[0].valid)
+ if ((dev->wanted_features ^ dev->features) & valid)
ret |= ETHTOOL_F_WISH;
return ret;
}
-static const char netdev_features_strings[ETHTOOL_DEV_FEATURE_WORDS * 32][ETH_GSTRING_LEN] = {
- /* NETIF_F_SG */ "tx-scatter-gather",
- /* NETIF_F_IP_CSUM */ "tx-checksum-ipv4",
- /* NETIF_F_NO_CSUM */ "tx-checksum-unneeded",
- /* NETIF_F_HW_CSUM */ "tx-checksum-ip-generic",
- /* NETIF_F_IPV6_CSUM */ "tx-checksum-ipv6",
- /* NETIF_F_HIGHDMA */ "highdma",
- /* NETIF_F_FRAGLIST */ "tx-scatter-gather-fraglist",
- /* NETIF_F_HW_VLAN_TX */ "tx-vlan-hw-insert",
-
- /* NETIF_F_HW_VLAN_RX */ "rx-vlan-hw-parse",
- /* NETIF_F_HW_VLAN_FILTER */ "rx-vlan-filter",
- /* NETIF_F_VLAN_CHALLENGED */ "vlan-challenged",
- /* NETIF_F_GSO */ "tx-generic-segmentation",
- /* NETIF_F_LLTX */ "tx-lockless",
- /* NETIF_F_NETNS_LOCAL */ "netns-local",
- /* NETIF_F_GRO */ "rx-gro",
- /* NETIF_F_LRO */ "rx-lro",
-
- /* NETIF_F_TSO */ "tx-tcp-segmentation",
- /* NETIF_F_UFO */ "tx-udp-fragmentation",
- /* NETIF_F_GSO_ROBUST */ "tx-gso-robust",
- /* NETIF_F_TSO_ECN */ "tx-tcp-ecn-segmentation",
- /* NETIF_F_TSO6 */ "tx-tcp6-segmentation",
- /* NETIF_F_FSO */ "tx-fcoe-segmentation",
- "",
- "",
-
- /* NETIF_F_FCOE_CRC */ "tx-checksum-fcoe-crc",
- /* NETIF_F_SCTP_CSUM */ "tx-checksum-sctp",
- /* NETIF_F_FCOE_MTU */ "fcoe-mtu",
- /* NETIF_F_NTUPLE */ "rx-ntuple-filter",
- /* NETIF_F_RXHASH */ "rx-hashing",
- /* NETIF_F_RXCSUM */ "rx-checksum",
- /* NETIF_F_NOCACHE_COPY */ "tx-nocache-copy",
- /* NETIF_F_LOOPBACK */ "loopback",
-};
-
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -402,7 +179,7 @@ static void __ethtool_get_strings(struct net_device *dev,
ops->get_strings(dev, stringset, data);
}
-static u32 ethtool_get_feature_mask(u32 eth_cmd)
+static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
{
/* feature masks of legacy discrete ethtool ops */
@@ -433,151 +210,107 @@ static u32 ethtool_get_feature_mask(u32 eth_cmd)
}
}
-static void *__ethtool_get_one_feature_actor(struct net_device *dev, u32 ethcmd)
-{
- const struct ethtool_ops *ops = dev->ethtool_ops;
-
- if (!ops)
- return NULL;
-
- switch (ethcmd) {
- case ETHTOOL_GTXCSUM:
- return ops->get_tx_csum;
- case ETHTOOL_GRXCSUM:
- return ops->get_rx_csum;
- case ETHTOOL_SSG:
- return ops->get_sg;
- case ETHTOOL_STSO:
- return ops->get_tso;
- case ETHTOOL_SUFO:
- return ops->get_ufo;
- default:
- return NULL;
- }
-}
-
-static u32 __ethtool_get_rx_csum_oldbug(struct net_device *dev)
-{
- return !!(dev->features & NETIF_F_ALL_CSUM);
-}
-
static int ethtool_get_one_feature(struct net_device *dev,
char __user *useraddr, u32 ethcmd)
{
- u32 mask = ethtool_get_feature_mask(ethcmd);
+ netdev_features_t mask = ethtool_get_feature_mask(ethcmd);
struct ethtool_value edata = {
.cmd = ethcmd,
.data = !!(dev->features & mask),
};
- /* compatibility with discrete get_ ops */
- if (!(dev->hw_features & mask)) {
- u32 (*actor)(struct net_device *);
-
- actor = __ethtool_get_one_feature_actor(dev, ethcmd);
-
- /* bug compatibility with old get_rx_csum */
- if (ethcmd == ETHTOOL_GRXCSUM && !actor)
- actor = __ethtool_get_rx_csum_oldbug;
-
- if (actor)
- edata.data = actor(dev);
- }
-
if (copy_to_user(useraddr, &edata, sizeof(edata)))
return -EFAULT;
return 0;
}
-static int __ethtool_set_tx_csum(struct net_device *dev, u32 data);
-static int __ethtool_set_rx_csum(struct net_device *dev, u32 data);
-static int __ethtool_set_sg(struct net_device *dev, u32 data);
-static int __ethtool_set_tso(struct net_device *dev, u32 data);
-static int __ethtool_set_ufo(struct net_device *dev, u32 data);
-
static int ethtool_set_one_feature(struct net_device *dev,
void __user *useraddr, u32 ethcmd)
{
struct ethtool_value edata;
- u32 mask;
+ netdev_features_t mask;
if (copy_from_user(&edata, useraddr, sizeof(edata)))
return -EFAULT;
mask = ethtool_get_feature_mask(ethcmd);
mask &= dev->hw_features;
- if (mask) {
- if (edata.data)
- dev->wanted_features |= mask;
- else
- dev->wanted_features &= ~mask;
+ if (!mask)
+ return -EOPNOTSUPP;
- __netdev_update_features(dev);
- return 0;
- }
+ if (edata.data)
+ dev->wanted_features |= mask;
+ else
+ dev->wanted_features &= ~mask;
- /* Driver is not converted to ndo_fix_features or does not
- * support changing this offload. In the latter case it won't
- * have corresponding ethtool_ops field set.
- *
- * Following part is to be removed after all drivers advertise
- * their changeable features in netdev->hw_features and stop
- * using discrete offload setting ops.
- */
+ __netdev_update_features(dev);
- switch (ethcmd) {
- case ETHTOOL_STXCSUM:
- return __ethtool_set_tx_csum(dev, edata.data);
- case ETHTOOL_SRXCSUM:
- return __ethtool_set_rx_csum(dev, edata.data);
- case ETHTOOL_SSG:
- return __ethtool_set_sg(dev, edata.data);
- case ETHTOOL_STSO:
- return __ethtool_set_tso(dev, edata.data);
- case ETHTOOL_SUFO:
- return __ethtool_set_ufo(dev, edata.data);
- default:
- return -EOPNOTSUPP;
- }
+ return 0;
}
-int __ethtool_set_flags(struct net_device *dev, u32 data)
+#define ETH_ALL_FLAGS (ETH_FLAG_LRO | ETH_FLAG_RXVLAN | ETH_FLAG_TXVLAN | \
+ ETH_FLAG_NTUPLE | ETH_FLAG_RXHASH)
+#define ETH_ALL_FEATURES (NETIF_F_LRO | NETIF_F_HW_VLAN_RX | \
+ NETIF_F_HW_VLAN_TX | NETIF_F_NTUPLE | NETIF_F_RXHASH)
+
+static u32 __ethtool_get_flags(struct net_device *dev)
{
- u32 changed;
+ u32 flags = 0;
+
+ if (dev->features & NETIF_F_LRO) flags |= ETH_FLAG_LRO;
+ if (dev->features & NETIF_F_HW_VLAN_RX) flags |= ETH_FLAG_RXVLAN;
+ if (dev->features & NETIF_F_HW_VLAN_TX) flags |= ETH_FLAG_TXVLAN;
+ if (dev->features & NETIF_F_NTUPLE) flags |= ETH_FLAG_NTUPLE;
+ if (dev->features & NETIF_F_RXHASH) flags |= ETH_FLAG_RXHASH;
- if (data & ~flags_dup_features)
+ return flags;
+}
+
+static int __ethtool_set_flags(struct net_device *dev, u32 data)
+{
+ netdev_features_t features = 0, changed;
+
+ if (data & ~ETH_ALL_FLAGS)
return -EINVAL;
- /* legacy set_flags() op */
- if (dev->ethtool_ops->set_flags) {
- if (unlikely(dev->hw_features & flags_dup_features))
- netdev_warn(dev,
- "driver BUG: mixed hw_features and set_flags()\n");
- return dev->ethtool_ops->set_flags(dev, data);
- }
+ if (data & ETH_FLAG_LRO) features |= NETIF_F_LRO;
+ if (data & ETH_FLAG_RXVLAN) features |= NETIF_F_HW_VLAN_RX;
+ if (data & ETH_FLAG_TXVLAN) features |= NETIF_F_HW_VLAN_TX;
+ if (data & ETH_FLAG_NTUPLE) features |= NETIF_F_NTUPLE;
+ if (data & ETH_FLAG_RXHASH) features |= NETIF_F_RXHASH;
/* allow changing only bits set in hw_features */
- changed = (data ^ dev->features) & flags_dup_features;
+ changed = (features ^ dev->features) & ETH_ALL_FEATURES;
if (changed & ~dev->hw_features)
return (changed & dev->hw_features) ? -EINVAL : -EOPNOTSUPP;
dev->wanted_features =
- (dev->wanted_features & ~changed) | (data & dev->hw_features);
+ (dev->wanted_features & ~changed) | (features & changed);
__netdev_update_features(dev);
return 0;
}
-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
{
- struct ethtool_cmd cmd = { .cmd = ETHTOOL_GSET };
- int err;
+ ASSERT_RTNL();
- if (!dev->ethtool_ops->get_settings)
+ if (!dev->ethtool_ops || !dev->ethtool_ops->get_settings)
return -EOPNOTSUPP;
- err = dev->ethtool_ops->get_settings(dev, &cmd);
+ memset(cmd, 0, sizeof(struct ethtool_cmd));
+ cmd->cmd = ETHTOOL_GSET;
+ return dev->ethtool_ops->get_settings(dev, cmd);
+}
+EXPORT_SYMBOL(__ethtool_get_settings);
+
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+ int err;
+ struct ethtool_cmd cmd;
+
+ err = __ethtool_get_settings(dev, &cmd);
if (err < 0)
return err;
@@ -706,6 +439,7 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
{
struct ethtool_rxnfc info;
size_t info_size = sizeof(info);
+ int rc;
if (!dev->ethtool_ops->set_rxnfc)
return -EOPNOTSUPP;
@@ -721,7 +455,15 @@ static noinline_for_stack int ethtool_set_rxnfc(struct net_device *dev,
if (copy_from_user(&info, useraddr, info_size))
return -EFAULT;
- return dev->ethtool_ops->set_rxnfc(dev, &info);
+ rc = dev->ethtool_ops->set_rxnfc(dev, &info);
+ if (rc)
+ return rc;
+
+ if (cmd == ETHTOOL_SRXCLSRLINS &&
+ copy_to_user(useraddr, &info, info_size))
+ return -EFAULT;
+
+ return 0;
}
static noinline_for_stack int ethtool_get_rxnfc(struct net_device *dev,
@@ -782,34 +524,44 @@ err_out:
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_rxfh_indir *indir;
- u32 table_size;
- size_t full_size;
+ u32 user_size, dev_size;
+ u32 *indir;
int ret;
- if (!dev->ethtool_ops->get_rxfh_indir)
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh_indir)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
return -EOPNOTSUPP;
- if (copy_from_user(&table_size,
+ if (copy_from_user(&user_size,
useraddr + offsetof(struct ethtool_rxfh_indir, size),
- sizeof(table_size)))
+ sizeof(user_size)))
return -EFAULT;
- if (table_size >
- (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
- return -ENOMEM;
- full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
- indir = kzalloc(full_size, GFP_USER);
+ if (copy_to_user(useraddr + offsetof(struct ethtool_rxfh_indir, size),
+ &dev_size, sizeof(dev_size)))
+ return -EFAULT;
+
+ /* If the user buffer size is 0, this is just a query for the
+ * device table size. Otherwise, if it's smaller than the
+ * device table size it's an error.
+ */
+ if (user_size < dev_size)
+ return user_size == 0 ? 0 : -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
if (!indir)
return -ENOMEM;
- indir->cmd = ETHTOOL_GRXFHINDIR;
- indir->size = table_size;
ret = dev->ethtool_ops->get_rxfh_indir(dev, indir);
if (ret)
goto out;
- if (copy_to_user(useraddr, indir, full_size))
+ if (copy_to_user(useraddr +
+ offsetof(struct ethtool_rxfh_indir, ring_index[0]),
+ indir, dev_size * sizeof(indir[0])))
ret = -EFAULT;
out:
@@ -820,30 +572,56 @@ out:
static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_rxfh_indir *indir;
- u32 table_size;
- size_t full_size;
+ struct ethtool_rxnfc rx_rings;
+ u32 user_size, dev_size, i;
+ u32 *indir;
int ret;
- if (!dev->ethtool_ops->set_rxfh_indir)
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->set_rxfh_indir ||
+ !dev->ethtool_ops->get_rxnfc)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
return -EOPNOTSUPP;
- if (copy_from_user(&table_size,
+ if (copy_from_user(&user_size,
useraddr + offsetof(struct ethtool_rxfh_indir, size),
- sizeof(table_size)))
+ sizeof(user_size)))
return -EFAULT;
- if (table_size >
- (KMALLOC_MAX_SIZE - sizeof(*indir)) / sizeof(*indir->ring_index))
- return -ENOMEM;
- full_size = sizeof(*indir) + sizeof(*indir->ring_index) * table_size;
- indir = kmalloc(full_size, GFP_USER);
+ if (user_size != 0 && user_size != dev_size)
+ return -EINVAL;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
if (!indir)
return -ENOMEM;
- if (copy_from_user(indir, useraddr, full_size)) {
- ret = -EFAULT;
+ rx_rings.cmd = ETHTOOL_GRXRINGS;
+ ret = dev->ethtool_ops->get_rxnfc(dev, &rx_rings, NULL);
+ if (ret)
goto out;
+
+ if (user_size == 0) {
+ for (i = 0; i < dev_size; i++)
+ indir[i] = ethtool_rxfh_indir_default(i, rx_rings.data);
+ } else {
+ if (copy_from_user(indir,
+ useraddr +
+ offsetof(struct ethtool_rxfh_indir,
+ ring_index[0]),
+ dev_size * sizeof(indir[0]))) {
+ ret = -EFAULT;
+ goto out;
+ }
+
+ /* Validate ring indices */
+ for (i = 0; i < dev_size; i++) {
+ if (indir[i] >= rx_rings.data) {
+ ret = -EINVAL;
+ goto out;
+ }
+ }
}
ret = dev->ethtool_ops->set_rxfh_indir(dev, indir);
@@ -853,58 +631,6 @@ out:
return ret;
}
-/*
- * ethtool does not (or did not) set masks for flow parameters that are
- * not specified, so if both value and mask are 0 then this must be
- * treated as equivalent to a mask with all bits set. Implement that
- * here rather than in drivers.
- */
-static void rx_ntuple_fix_masks(struct ethtool_rx_ntuple_flow_spec *fs)
-{
- struct ethtool_tcpip4_spec *entry = &fs->h_u.tcp_ip4_spec;
- struct ethtool_tcpip4_spec *mask = &fs->m_u.tcp_ip4_spec;
-
- if (fs->flow_type != TCP_V4_FLOW &&
- fs->flow_type != UDP_V4_FLOW &&
- fs->flow_type != SCTP_V4_FLOW)
- return;
-
- if (!(entry->ip4src | mask->ip4src))
- mask->ip4src = htonl(0xffffffff);
- if (!(entry->ip4dst | mask->ip4dst))
- mask->ip4dst = htonl(0xffffffff);
- if (!(entry->psrc | mask->psrc))
- mask->psrc = htons(0xffff);
- if (!(entry->pdst | mask->pdst))
- mask->pdst = htons(0xffff);
- if (!(entry->tos | mask->tos))
- mask->tos = 0xff;
- if (!(fs->vlan_tag | fs->vlan_tag_mask))
- fs->vlan_tag_mask = 0xffff;
- if (!(fs->data | fs->data_mask))
- fs->data_mask = 0xffffffffffffffffULL;
-}
-
-static noinline_for_stack int ethtool_set_rx_ntuple(struct net_device *dev,
- void __user *useraddr)
-{
- struct ethtool_rx_ntuple cmd;
- const struct ethtool_ops *ops = dev->ethtool_ops;
-
- if (!ops->set_rx_ntuple)
- return -EOPNOTSUPP;
-
- if (!(dev->features & NETIF_F_NTUPLE))
- return -EINVAL;
-
- if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
- return -EFAULT;
-
- rx_ntuple_fix_masks(&cmd.fs);
-
- return ops->set_rx_ntuple(dev, &cmd);
-}
-
static int ethtool_get_regs(struct net_device *dev, char __user *useraddr)
{
struct ethtool_regs regs;
@@ -1221,81 +947,6 @@ static int ethtool_set_pauseparam(struct net_device *dev, void __user *useraddr)
return dev->ethtool_ops->set_pauseparam(dev, &pauseparam);
}
-static int __ethtool_set_sg(struct net_device *dev, u32 data)
-{
- int err;
-
- if (!dev->ethtool_ops->set_sg)
- return -EOPNOTSUPP;
-
- if (data && !(dev->features & NETIF_F_ALL_CSUM))
- return -EINVAL;
-
- if (!data && dev->ethtool_ops->set_tso) {
- err = dev->ethtool_ops->set_tso(dev, 0);
- if (err)
- return err;
- }
-
- if (!data && dev->ethtool_ops->set_ufo) {
- err = dev->ethtool_ops->set_ufo(dev, 0);
- if (err)
- return err;
- }
- return dev->ethtool_ops->set_sg(dev, data);
-}
-
-static int __ethtool_set_tx_csum(struct net_device *dev, u32 data)
-{
- int err;
-
- if (!dev->ethtool_ops->set_tx_csum)
- return -EOPNOTSUPP;
-
- if (!data && dev->ethtool_ops->set_sg) {
- err = __ethtool_set_sg(dev, 0);
- if (err)
- return err;
- }
-
- return dev->ethtool_ops->set_tx_csum(dev, data);
-}
-
-static int __ethtool_set_rx_csum(struct net_device *dev, u32 data)
-{
- if (!dev->ethtool_ops->set_rx_csum)
- return -EOPNOTSUPP;
-
- if (!data)
- dev->features &= ~NETIF_F_GRO;
-
- return dev->ethtool_ops->set_rx_csum(dev, data);
-}
-
-static int __ethtool_set_tso(struct net_device *dev, u32 data)
-{
- if (!dev->ethtool_ops->set_tso)
- return -EOPNOTSUPP;
-
- if (data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
-
- return dev->ethtool_ops->set_tso(dev, data);
-}
-
-static int __ethtool_set_ufo(struct net_device *dev, u32 data)
-{
- if (!dev->ethtool_ops->set_ufo)
- return -EOPNOTSUPP;
- if (data && !(dev->features & NETIF_F_SG))
- return -EINVAL;
- if (data && !((dev->features & NETIF_F_GEN_CSUM) ||
- (dev->features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
- == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM)))
- return -EINVAL;
- return dev->ethtool_ops->set_ufo(dev, data);
-}
-
static int ethtool_self_test(struct net_device *dev, char __user *useraddr)
{
struct ethtool_test test;
@@ -1761,9 +1412,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
break;
case ETHTOOL_GFLAGS:
rc = ethtool_get_value(dev, useraddr, ethcmd,
- (dev->ethtool_ops->get_flags ?
- dev->ethtool_ops->get_flags :
- ethtool_op_get_flags));
+ __ethtool_get_flags);
break;
case ETHTOOL_SFLAGS:
rc = ethtool_set_value(dev, useraddr, __ethtool_set_flags);
@@ -1794,9 +1443,6 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_RESET:
rc = ethtool_reset(dev, useraddr);
break;
- case ETHTOOL_SRXNTUPLE:
- rc = ethtool_set_rx_ntuple(dev, useraddr);
- break;
case ETHTOOL_GSSET_INFO:
rc = ethtool_get_sset_info(dev, useraddr);
break;
diff --git a/net/core/fib_rules.c b/net/core/fib_rules.c
index 27071ee2a4e..c02e63c908d 100644
--- a/net/core/fib_rules.c
+++ b/net/core/fib_rules.c
@@ -12,6 +12,7 @@
#include <linux/kernel.h>
#include <linux/slab.h>
#include <linux/list.h>
+#include <linux/module.h>
#include <net/net_namespace.h>
#include <net/sock.h>
#include <net/fib_rules.h>
@@ -490,7 +491,7 @@ static int fib_nl_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg)
if (ops->nr_goto_rules > 0) {
list_for_each_entry(tmp, &ops->rules_list, list) {
if (rtnl_dereference(tmp->ctarget) == rule) {
- rcu_assign_pointer(tmp->ctarget, NULL);
+ RCU_INIT_POINTER(tmp->ctarget, NULL);
ops->unresolved_rules++;
}
}
@@ -548,7 +549,7 @@ static int fib_nl_fill_rule(struct sk_buff *skb, struct fib_rule *rule,
frh->flags = rule->flags;
if (rule->action == FR_ACT_GOTO &&
- rcu_dereference_raw(rule->ctarget) == NULL)
+ rcu_access_pointer(rule->ctarget) == NULL)
frh->flags |= FIB_RULE_UNRESOLVED;
if (rule->iifname[0]) {
diff --git a/net/core/filter.c b/net/core/filter.c
index 36f975fa87c..5dea4527921 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -436,7 +436,7 @@ error:
*
* Returns 0 if the rule set is legal or -EINVAL if not.
*/
-int sk_chk_filter(struct sock_filter *filter, int flen)
+int sk_chk_filter(struct sock_filter *filter, unsigned int flen)
{
/*
* Valid instructions are initialized to non-0.
@@ -645,7 +645,7 @@ int sk_detach_filter(struct sock *sk)
filter = rcu_dereference_protected(sk->sk_filter,
sock_owned_by_user(sk));
if (filter) {
- rcu_assign_pointer(sk->sk_filter, NULL);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
sk_filter_uncharge(sk, filter);
ret = 0;
}
diff --git a/net/core/flow.c b/net/core/flow.c
index 555a456efb0..e318c7e9804 100644
--- a/net/core/flow.c
+++ b/net/core/flow.c
@@ -358,6 +358,18 @@ void flow_cache_flush(void)
put_online_cpus();
}
+static void flow_cache_flush_task(struct work_struct *work)
+{
+ flow_cache_flush();
+}
+
+static DECLARE_WORK(flow_cache_flush_work, flow_cache_flush_task);
+
+void flow_cache_flush_deferred(void)
+{
+ schedule_work(&flow_cache_flush_work);
+}
+
static int __cpuinit flow_cache_cpu_prepare(struct flow_cache *fc, int cpu)
{
struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, cpu);
@@ -413,7 +425,7 @@ static int __init flow_cache_init(struct flow_cache *fc)
for_each_online_cpu(i) {
if (flow_cache_cpu_prepare(fc, i))
- return -ENOMEM;
+ goto err;
}
fc->hotcpu_notifier = (struct notifier_block){
.notifier_call = flow_cache_cpu,
@@ -426,6 +438,18 @@ static int __init flow_cache_init(struct flow_cache *fc)
add_timer(&fc->rnd_timer);
return 0;
+
+err:
+ for_each_possible_cpu(i) {
+ struct flow_cache_percpu *fcp = per_cpu_ptr(fc->percpu, i);
+ kfree(fcp->hash_table);
+ fcp->hash_table = NULL;
+ }
+
+ free_percpu(fc->percpu);
+ fc->percpu = NULL;
+
+ return -ENOMEM;
}
static int __init flow_cache_init_global(void)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
new file mode 100644
index 00000000000..0985b9b14b8
--- /dev/null
+++ b/net/core/flow_dissector.c
@@ -0,0 +1,143 @@
+#include <linux/skbuff.h>
+#include <linux/ip.h>
+#include <linux/ipv6.h>
+#include <linux/if_vlan.h>
+#include <net/ip.h>
+#include <linux/if_tunnel.h>
+#include <linux/if_pppox.h>
+#include <linux/ppp_defs.h>
+#include <net/flow_keys.h>
+
+/* copy saddr & daddr, possibly using 64bit load/store
+ * Equivalent to : flow->src = iph->saddr;
+ * flow->dst = iph->daddr;
+ */
+static void iph_to_flow_copy_addrs(struct flow_keys *flow, const struct iphdr *iph)
+{
+ BUILD_BUG_ON(offsetof(typeof(*flow), dst) !=
+ offsetof(typeof(*flow), src) + sizeof(flow->src));
+ memcpy(&flow->src, &iph->saddr, sizeof(flow->src) + sizeof(flow->dst));
+}
+
+bool skb_flow_dissect(const struct sk_buff *skb, struct flow_keys *flow)
+{
+ int poff, nhoff = skb_network_offset(skb);
+ u8 ip_proto;
+ __be16 proto = skb->protocol;
+
+ memset(flow, 0, sizeof(*flow));
+
+again:
+ switch (proto) {
+ case __constant_htons(ETH_P_IP): {
+ const struct iphdr *iph;
+ struct iphdr _iph;
+ip:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
+
+ if (ip_is_fragment(iph))
+ ip_proto = 0;
+ else
+ ip_proto = iph->protocol;
+ iph_to_flow_copy_addrs(flow, iph);
+ nhoff += iph->ihl * 4;
+ break;
+ }
+ case __constant_htons(ETH_P_IPV6): {
+ const struct ipv6hdr *iph;
+ struct ipv6hdr _iph;
+ipv6:
+ iph = skb_header_pointer(skb, nhoff, sizeof(_iph), &_iph);
+ if (!iph)
+ return false;
+
+ ip_proto = iph->nexthdr;
+ flow->src = iph->saddr.s6_addr32[3];
+ flow->dst = iph->daddr.s6_addr32[3];
+ nhoff += sizeof(struct ipv6hdr);
+ break;
+ }
+ case __constant_htons(ETH_P_8021Q): {
+ const struct vlan_hdr *vlan;
+ struct vlan_hdr _vlan;
+
+ vlan = skb_header_pointer(skb, nhoff, sizeof(_vlan), &_vlan);
+ if (!vlan)
+ return false;
+
+ proto = vlan->h_vlan_encapsulated_proto;
+ nhoff += sizeof(*vlan);
+ goto again;
+ }
+ case __constant_htons(ETH_P_PPP_SES): {
+ struct {
+ struct pppoe_hdr hdr;
+ __be16 proto;
+ } *hdr, _hdr;
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ proto = hdr->proto;
+ nhoff += PPPOE_SES_HLEN;
+ switch (proto) {
+ case __constant_htons(PPP_IP):
+ goto ip;
+ case __constant_htons(PPP_IPV6):
+ goto ipv6;
+ default:
+ return false;
+ }
+ }
+ default:
+ return false;
+ }
+
+ switch (ip_proto) {
+ case IPPROTO_GRE: {
+ struct gre_hdr {
+ __be16 flags;
+ __be16 proto;
+ } *hdr, _hdr;
+
+ hdr = skb_header_pointer(skb, nhoff, sizeof(_hdr), &_hdr);
+ if (!hdr)
+ return false;
+ /*
+ * Only look inside GRE if version zero and no
+ * routing
+ */
+ if (!(hdr->flags & (GRE_VERSION|GRE_ROUTING))) {
+ proto = hdr->proto;
+ nhoff += 4;
+ if (hdr->flags & GRE_CSUM)
+ nhoff += 4;
+ if (hdr->flags & GRE_KEY)
+ nhoff += 4;
+ if (hdr->flags & GRE_SEQ)
+ nhoff += 4;
+ goto again;
+ }
+ break;
+ }
+ case IPPROTO_IPIP:
+ goto again;
+ default:
+ break;
+ }
+
+ flow->ip_proto = ip_proto;
+ poff = proto_ports_offset(ip_proto);
+ if (poff >= 0) {
+ __be32 *ports, _ports;
+
+ nhoff += poff;
+ ports = skb_header_pointer(skb, nhoff, sizeof(_ports), &_ports);
+ if (ports)
+ flow->ports = *ports;
+ }
+
+ return true;
+}
+EXPORT_SYMBOL(skb_flow_dissect);
diff --git a/net/core/kmap_skb.h b/net/core/kmap_skb.h
index 283c2b993fb..81e1ed7c838 100644
--- a/net/core/kmap_skb.h
+++ b/net/core/kmap_skb.h
@@ -7,7 +7,7 @@ static inline void *kmap_skb_frag(const skb_frag_t *frag)
local_bh_disable();
#endif
- return kmap_atomic(frag->page, KM_SKB_DATA_SOFTIRQ);
+ return kmap_atomic(skb_frag_page(frag), KM_SKB_DATA_SOFTIRQ);
}
static inline void kunmap_skb_frag(void *vaddr)
diff --git a/net/core/link_watch.c b/net/core/link_watch.c
index 357bd4ee4ba..c3519c6d1b1 100644
--- a/net/core/link_watch.c
+++ b/net/core/link_watch.c
@@ -78,8 +78,13 @@ static void rfc2863_policy(struct net_device *dev)
static bool linkwatch_urgent_event(struct net_device *dev)
{
- return netif_running(dev) && netif_carrier_ok(dev) &&
- qdisc_tx_changing(dev);
+ if (!netif_running(dev))
+ return false;
+
+ if (dev->ifindex != dev->iflink)
+ return true;
+
+ return netif_carrier_ok(dev) && qdisc_tx_changing(dev);
}
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1334d7e56f0..e287346e093 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -238,6 +238,7 @@ static void neigh_flush_dev(struct neigh_table *tbl, struct net_device *dev)
it to safe state.
*/
skb_queue_purge(&n->arp_queue);
+ n->arp_queue_len_bytes = 0;
n->output = neigh_blackhole;
if (n->nud_state & NUD_VALID)
n->nud_state = NUD_NOARP;
@@ -272,7 +273,7 @@ int neigh_ifdown(struct neigh_table *tbl, struct net_device *dev)
}
EXPORT_SYMBOL(neigh_ifdown);
-static struct neighbour *neigh_alloc(struct neigh_table *tbl)
+static struct neighbour *neigh_alloc(struct neigh_table *tbl, struct net_device *dev)
{
struct neighbour *n = NULL;
unsigned long now = jiffies;
@@ -287,7 +288,15 @@ static struct neighbour *neigh_alloc(struct neigh_table *tbl)
goto out_entries;
}
- n = kmem_cache_zalloc(tbl->kmem_cachep, GFP_ATOMIC);
+ if (tbl->entry_size)
+ n = kzalloc(tbl->entry_size, GFP_ATOMIC);
+ else {
+ int sz = sizeof(*n) + tbl->key_len;
+
+ sz = ALIGN(sz, NEIGH_PRIV_ALIGN);
+ sz += dev->neigh_priv_len;
+ n = kzalloc(sz, GFP_ATOMIC);
+ }
if (!n)
goto out_entries;
@@ -313,11 +322,18 @@ out_entries:
goto out;
}
+static void neigh_get_hash_rnd(u32 *x)
+{
+ get_random_bytes(x, sizeof(*x));
+ *x |= 1;
+}
+
static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
{
size_t size = (1 << shift) * sizeof(struct neighbour *);
struct neigh_hash_table *ret;
struct neighbour __rcu **buckets;
+ int i;
ret = kmalloc(sizeof(*ret), GFP_ATOMIC);
if (!ret)
@@ -334,8 +350,8 @@ static struct neigh_hash_table *neigh_hash_alloc(unsigned int shift)
}
ret->hash_buckets = buckets;
ret->hash_shift = shift;
- get_random_bytes(&ret->hash_rnd, sizeof(ret->hash_rnd));
- ret->hash_rnd |= 1;
+ for (i = 0; i < NEIGH_NUM_HASH_RND; i++)
+ neigh_get_hash_rnd(&ret->hash_rnd[i]);
return ret;
}
@@ -462,7 +478,7 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
u32 hash_val;
int key_len = tbl->key_len;
int error;
- struct neighbour *n1, *rc, *n = neigh_alloc(tbl);
+ struct neighbour *n1, *rc, *n = neigh_alloc(tbl, dev);
struct neigh_hash_table *nht;
if (!n) {
@@ -480,6 +496,14 @@ struct neighbour *neigh_create(struct neigh_table *tbl, const void *pkey,
goto out_neigh_release;
}
+ if (dev->netdev_ops->ndo_neigh_construct) {
+ error = dev->netdev_ops->ndo_neigh_construct(n);
+ if (error < 0) {
+ rc = ERR_PTR(error);
+ goto out_neigh_release;
+ }
+ }
+
/* Device specific setup. */
if (n->parms->neigh_setup &&
(error = n->parms->neigh_setup(n)) < 0) {
@@ -677,18 +701,14 @@ static inline void neigh_parms_put(struct neigh_parms *parms)
neigh_parms_destroy(parms);
}
-static void neigh_destroy_rcu(struct rcu_head *head)
-{
- struct neighbour *neigh = container_of(head, struct neighbour, rcu);
-
- kmem_cache_free(neigh->tbl->kmem_cachep, neigh);
-}
/*
* neighbour must already be out of the table;
*
*/
void neigh_destroy(struct neighbour *neigh)
{
+ struct net_device *dev = neigh->dev;
+
NEIGH_CACHE_STAT_INC(neigh->tbl, destroys);
if (!neigh->dead) {
@@ -702,14 +722,18 @@ void neigh_destroy(struct neighbour *neigh)
printk(KERN_WARNING "Impossible event.\n");
skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
- dev_put(neigh->dev);
+ if (dev->netdev_ops->ndo_neigh_destroy)
+ dev->netdev_ops->ndo_neigh_destroy(neigh);
+
+ dev_put(dev);
neigh_parms_put(neigh->parms);
NEIGH_PRINTK2("neigh %p is destroyed.\n", neigh);
atomic_dec(&neigh->tbl->entries);
- call_rcu(&neigh->rcu, neigh_destroy_rcu);
+ kfree_rcu(neigh, rcu);
}
EXPORT_SYMBOL(neigh_destroy);
@@ -842,6 +866,20 @@ static void neigh_invalidate(struct neighbour *neigh)
write_lock(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
+}
+
+static void neigh_probe(struct neighbour *neigh)
+ __releases(neigh->lock)
+{
+ struct sk_buff *skb = skb_peek(&neigh->arp_queue);
+ /* keep skb alive even if arp_queue overflows */
+ if (skb)
+ skb = skb_copy(skb, GFP_ATOMIC);
+ write_unlock(&neigh->lock);
+ neigh->ops->solicit(neigh, skb);
+ atomic_inc(&neigh->probes);
+ kfree_skb(skb);
}
/* Called when a timer expires for a neighbour entry. */
@@ -859,12 +897,8 @@ static void neigh_timer_handler(unsigned long arg)
now = jiffies;
next = now + HZ;
- if (!(state & NUD_IN_TIMER)) {
-#ifndef CONFIG_SMP
- printk(KERN_WARNING "neigh: timer & !nud_in_timer\n");
-#endif
+ if (!(state & NUD_IN_TIMER))
goto out;
- }
if (state & NUD_REACHABLE) {
if (time_before_eq(now,
@@ -920,14 +954,7 @@ static void neigh_timer_handler(unsigned long arg)
neigh_hold(neigh);
}
if (neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) {
- struct sk_buff *skb = skb_peek(&neigh->arp_queue);
- /* keep skb alive even if arp_queue overflows */
- if (skb)
- skb = skb_copy(skb, GFP_ATOMIC);
- write_unlock(&neigh->lock);
- neigh->ops->solicit(neigh, skb);
- atomic_inc(&neigh->probes);
- kfree_skb(skb);
+ neigh_probe(neigh);
} else {
out:
write_unlock(&neigh->lock);
@@ -942,7 +969,7 @@ out:
int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
{
int rc;
- unsigned long now;
+ bool immediate_probe = false;
write_lock_bh(&neigh->lock);
@@ -950,14 +977,16 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
if (neigh->nud_state & (NUD_CONNECTED | NUD_DELAY | NUD_PROBE))
goto out_unlock_bh;
- now = jiffies;
-
if (!(neigh->nud_state & (NUD_STALE | NUD_INCOMPLETE))) {
if (neigh->parms->mcast_probes + neigh->parms->app_probes) {
+ unsigned long next, now = jiffies;
+
atomic_set(&neigh->probes, neigh->parms->ucast_probes);
neigh->nud_state = NUD_INCOMPLETE;
- neigh->updated = jiffies;
- neigh_add_timer(neigh, now + 1);
+ neigh->updated = now;
+ next = now + max(neigh->parms->retrans_time, HZ/2);
+ neigh_add_timer(neigh, next);
+ immediate_probe = true;
} else {
neigh->nud_state = NUD_FAILED;
neigh->updated = jiffies;
@@ -976,20 +1005,29 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
if (neigh->nud_state == NUD_INCOMPLETE) {
if (skb) {
- if (skb_queue_len(&neigh->arp_queue) >=
- neigh->parms->queue_len) {
+ while (neigh->arp_queue_len_bytes + skb->truesize >
+ neigh->parms->queue_len_bytes) {
struct sk_buff *buff;
+
buff = __skb_dequeue(&neigh->arp_queue);
+ if (!buff)
+ break;
+ neigh->arp_queue_len_bytes -= buff->truesize;
kfree_skb(buff);
NEIGH_CACHE_STAT_INC(neigh->tbl, unres_discards);
}
skb_dst_force(skb);
__skb_queue_tail(&neigh->arp_queue, skb);
+ neigh->arp_queue_len_bytes += skb->truesize;
}
rc = 1;
}
out_unlock_bh:
- write_unlock_bh(&neigh->lock);
+ if (immediate_probe)
+ neigh_probe(neigh);
+ else
+ write_unlock(&neigh->lock);
+ local_bh_enable();
return rc;
}
EXPORT_SYMBOL(__neigh_event_send);
@@ -1156,13 +1194,18 @@ int neigh_update(struct neighbour *neigh, const u8 *lladdr, u8 new,
struct dst_entry *dst = skb_dst(skb);
struct neighbour *n2, *n1 = neigh;
write_unlock_bh(&neigh->lock);
+
+ rcu_read_lock();
/* On shaper/eql skb->dst->neighbour != neigh :( */
- if (dst && (n2 = dst_get_neighbour(dst)) != NULL)
+ if (dst && (n2 = dst_get_neighbour_noref(dst)) != NULL)
n1 = n2;
n1->output(n1, skb);
+ rcu_read_unlock();
+
write_lock_bh(&neigh->lock);
}
skb_queue_purge(&neigh->arp_queue);
+ neigh->arp_queue_len_bytes = 0;
}
out:
if (update_isrouter) {
@@ -1465,11 +1508,6 @@ void neigh_table_init_no_netlink(struct neigh_table *tbl)
tbl->parms.reachable_time =
neigh_rand_reach_time(tbl->parms.base_reachable_time);
- if (!tbl->kmem_cachep)
- tbl->kmem_cachep =
- kmem_cache_create(tbl->id, tbl->entry_size, 0,
- SLAB_HWCACHE_ALIGN|SLAB_PANIC,
- NULL);
tbl->stats = alloc_percpu(struct neigh_statistics);
if (!tbl->stats)
panic("cannot create neighbour cache statistics");
@@ -1554,9 +1592,6 @@ int neigh_table_clear(struct neigh_table *tbl)
free_percpu(tbl->stats);
tbl->stats = NULL;
- kmem_cache_destroy(tbl->kmem_cachep);
- tbl->kmem_cachep = NULL;
-
return 0;
}
EXPORT_SYMBOL(neigh_table_clear);
@@ -1735,7 +1770,11 @@ static int neightbl_fill_parms(struct sk_buff *skb, struct neigh_parms *parms)
NLA_PUT_U32(skb, NDTPA_IFINDEX, parms->dev->ifindex);
NLA_PUT_U32(skb, NDTPA_REFCNT, atomic_read(&parms->refcnt));
- NLA_PUT_U32(skb, NDTPA_QUEUE_LEN, parms->queue_len);
+ NLA_PUT_U32(skb, NDTPA_QUEUE_LENBYTES, parms->queue_len_bytes);
+ /* approximative value for deprecated QUEUE_LEN (in packets) */
+ NLA_PUT_U32(skb, NDTPA_QUEUE_LEN,
+ DIV_ROUND_UP(parms->queue_len_bytes,
+ SKB_TRUESIZE(ETH_FRAME_LEN)));
NLA_PUT_U32(skb, NDTPA_PROXY_QLEN, parms->proxy_qlen);
NLA_PUT_U32(skb, NDTPA_APP_PROBES, parms->app_probes);
NLA_PUT_U32(skb, NDTPA_UCAST_PROBES, parms->ucast_probes);
@@ -1796,7 +1835,7 @@ static int neightbl_fill_info(struct sk_buff *skb, struct neigh_table *tbl,
rcu_read_lock_bh();
nht = rcu_dereference_bh(tbl->nht);
- ndc.ndtc_hash_rnd = nht->hash_rnd;
+ ndc.ndtc_hash_rnd = nht->hash_rnd[0];
ndc.ndtc_hash_mask = ((1 << nht->hash_shift) - 1);
rcu_read_unlock_bh();
@@ -1962,7 +2001,11 @@ static int neightbl_set(struct sk_buff *skb, struct nlmsghdr *nlh, void *arg)
switch (i) {
case NDTPA_QUEUE_LEN:
- p->queue_len = nla_get_u32(tbp[i]);
+ p->queue_len_bytes = nla_get_u32(tbp[i]) *
+ SKB_TRUESIZE(ETH_FRAME_LEN);
+ break;
+ case NDTPA_QUEUE_LENBYTES:
+ p->queue_len_bytes = nla_get_u32(tbp[i]);
break;
case NDTPA_PROXY_QLEN:
p->proxy_qlen = nla_get_u32(tbp[i]);
@@ -2385,7 +2428,10 @@ static struct pneigh_entry *pneigh_get_next(struct seq_file *seq,
struct net *net = seq_file_net(seq);
struct neigh_table *tbl = state->tbl;
- pn = pn->next;
+ do {
+ pn = pn->next;
+ } while (pn && !net_eq(pneigh_net(pn), net));
+
while (!pn) {
if (++state->bucket > PNEIGH_HASHMASK)
break;
@@ -2623,117 +2669,158 @@ EXPORT_SYMBOL(neigh_app_ns);
#ifdef CONFIG_SYSCTL
-#define NEIGH_VARS_MAX 19
+static int proc_unres_qlen(ctl_table *ctl, int write, void __user *buffer,
+ size_t *lenp, loff_t *ppos)
+{
+ int size, ret;
+ ctl_table tmp = *ctl;
+
+ tmp.data = &size;
+ size = DIV_ROUND_UP(*(int *)ctl->data, SKB_TRUESIZE(ETH_FRAME_LEN));
+ ret = proc_dointvec(&tmp, write, buffer, lenp, ppos);
+ if (write && !ret)
+ *(int *)ctl->data = size * SKB_TRUESIZE(ETH_FRAME_LEN);
+ return ret;
+}
+
+enum {
+ NEIGH_VAR_MCAST_PROBE,
+ NEIGH_VAR_UCAST_PROBE,
+ NEIGH_VAR_APP_PROBE,
+ NEIGH_VAR_RETRANS_TIME,
+ NEIGH_VAR_BASE_REACHABLE_TIME,
+ NEIGH_VAR_DELAY_PROBE_TIME,
+ NEIGH_VAR_GC_STALETIME,
+ NEIGH_VAR_QUEUE_LEN,
+ NEIGH_VAR_QUEUE_LEN_BYTES,
+ NEIGH_VAR_PROXY_QLEN,
+ NEIGH_VAR_ANYCAST_DELAY,
+ NEIGH_VAR_PROXY_DELAY,
+ NEIGH_VAR_LOCKTIME,
+ NEIGH_VAR_RETRANS_TIME_MS,
+ NEIGH_VAR_BASE_REACHABLE_TIME_MS,
+ NEIGH_VAR_GC_INTERVAL,
+ NEIGH_VAR_GC_THRESH1,
+ NEIGH_VAR_GC_THRESH2,
+ NEIGH_VAR_GC_THRESH3,
+ NEIGH_VAR_MAX
+};
static struct neigh_sysctl_table {
struct ctl_table_header *sysctl_header;
- struct ctl_table neigh_vars[NEIGH_VARS_MAX];
+ struct ctl_table neigh_vars[NEIGH_VAR_MAX + 1];
char *dev_name;
} neigh_sysctl_template __read_mostly = {
.neigh_vars = {
- {
+ [NEIGH_VAR_MCAST_PROBE] = {
.procname = "mcast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_UCAST_PROBE] = {
.procname = "ucast_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_APP_PROBE] = {
.procname = "app_solicit",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_RETRANS_TIME] = {
.procname = "retrans_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_BASE_REACHABLE_TIME] = {
.procname = "base_reachable_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_DELAY_PROBE_TIME] = {
.procname = "delay_first_probe_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_GC_STALETIME] = {
.procname = "gc_stale_time",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_QUEUE_LEN] = {
.procname = "unres_qlen",
.maxlen = sizeof(int),
.mode = 0644,
+ .proc_handler = proc_unres_qlen,
+ },
+ [NEIGH_VAR_QUEUE_LEN_BYTES] = {
+ .procname = "unres_qlen_bytes",
+ .maxlen = sizeof(int),
+ .mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_PROXY_QLEN] = {
.procname = "proxy_qlen",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_ANYCAST_DELAY] = {
.procname = "anycast_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_PROXY_DELAY] = {
.procname = "proxy_delay",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_LOCKTIME] = {
.procname = "locktime",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_userhz_jiffies,
},
- {
+ [NEIGH_VAR_RETRANS_TIME_MS] = {
.procname = "retrans_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
- {
+ [NEIGH_VAR_BASE_REACHABLE_TIME_MS] = {
.procname = "base_reachable_time_ms",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
},
- {
+ [NEIGH_VAR_GC_INTERVAL] = {
.procname = "gc_interval",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
+ [NEIGH_VAR_GC_THRESH1] = {
.procname = "gc_thresh1",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_GC_THRESH2] = {
.procname = "gc_thresh2",
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec,
},
- {
+ [NEIGH_VAR_GC_THRESH3] = {
.procname = "gc_thresh3",
.maxlen = sizeof(int),
.mode = 0644,
@@ -2766,47 +2853,49 @@ int neigh_sysctl_register(struct net_device *dev, struct neigh_parms *p,
if (!t)
goto err;
- t->neigh_vars[0].data = &p->mcast_probes;
- t->neigh_vars[1].data = &p->ucast_probes;
- t->neigh_vars[2].data = &p->app_probes;
- t->neigh_vars[3].data = &p->retrans_time;
- t->neigh_vars[4].data = &p->base_reachable_time;
- t->neigh_vars[5].data = &p->delay_probe_time;
- t->neigh_vars[6].data = &p->gc_staletime;
- t->neigh_vars[7].data = &p->queue_len;
- t->neigh_vars[8].data = &p->proxy_qlen;
- t->neigh_vars[9].data = &p->anycast_delay;
- t->neigh_vars[10].data = &p->proxy_delay;
- t->neigh_vars[11].data = &p->locktime;
- t->neigh_vars[12].data = &p->retrans_time;
- t->neigh_vars[13].data = &p->base_reachable_time;
+ t->neigh_vars[NEIGH_VAR_MCAST_PROBE].data = &p->mcast_probes;
+ t->neigh_vars[NEIGH_VAR_UCAST_PROBE].data = &p->ucast_probes;
+ t->neigh_vars[NEIGH_VAR_APP_PROBE].data = &p->app_probes;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].data = &p->retrans_time;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].data = &p->base_reachable_time;
+ t->neigh_vars[NEIGH_VAR_DELAY_PROBE_TIME].data = &p->delay_probe_time;
+ t->neigh_vars[NEIGH_VAR_GC_STALETIME].data = &p->gc_staletime;
+ t->neigh_vars[NEIGH_VAR_QUEUE_LEN].data = &p->queue_len_bytes;
+ t->neigh_vars[NEIGH_VAR_QUEUE_LEN_BYTES].data = &p->queue_len_bytes;
+ t->neigh_vars[NEIGH_VAR_PROXY_QLEN].data = &p->proxy_qlen;
+ t->neigh_vars[NEIGH_VAR_ANYCAST_DELAY].data = &p->anycast_delay;
+ t->neigh_vars[NEIGH_VAR_PROXY_DELAY].data = &p->proxy_delay;
+ t->neigh_vars[NEIGH_VAR_LOCKTIME].data = &p->locktime;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].data = &p->retrans_time;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].data = &p->base_reachable_time;
if (dev) {
dev_name_source = dev->name;
/* Terminate the table early */
- memset(&t->neigh_vars[14], 0, sizeof(t->neigh_vars[14]));
+ memset(&t->neigh_vars[NEIGH_VAR_GC_INTERVAL], 0,
+ sizeof(t->neigh_vars[NEIGH_VAR_GC_INTERVAL]));
} else {
dev_name_source = neigh_path[NEIGH_CTL_PATH_DEV].procname;
- t->neigh_vars[14].data = (int *)(p + 1);
- t->neigh_vars[15].data = (int *)(p + 1) + 1;
- t->neigh_vars[16].data = (int *)(p + 1) + 2;
- t->neigh_vars[17].data = (int *)(p + 1) + 3;
+ t->neigh_vars[NEIGH_VAR_GC_INTERVAL].data = (int *)(p + 1);
+ t->neigh_vars[NEIGH_VAR_GC_THRESH1].data = (int *)(p + 1) + 1;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH2].data = (int *)(p + 1) + 2;
+ t->neigh_vars[NEIGH_VAR_GC_THRESH3].data = (int *)(p + 1) + 3;
}
if (handler) {
/* RetransTime */
- t->neigh_vars[3].proc_handler = handler;
- t->neigh_vars[3].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME].extra1 = dev;
/* ReachableTime */
- t->neigh_vars[4].proc_handler = handler;
- t->neigh_vars[4].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME].extra1 = dev;
/* RetransTime (in milliseconds)*/
- t->neigh_vars[12].proc_handler = handler;
- t->neigh_vars[12].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_RETRANS_TIME_MS].extra1 = dev;
/* ReachableTime (in milliseconds) */
- t->neigh_vars[13].proc_handler = handler;
- t->neigh_vars[13].extra1 = dev;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].proc_handler = handler;
+ t->neigh_vars[NEIGH_VAR_BASE_REACHABLE_TIME_MS].extra1 = dev;
}
t->dev_name = kstrdup(dev_name_source, GFP_KERNEL);
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index 1683e5db2f2..f3dbd4f596a 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -20,6 +20,8 @@
#include <linux/rtnetlink.h>
#include <linux/wireless.h>
#include <linux/vmalloc.h>
+#include <linux/export.h>
+#include <linux/jiffies.h>
#include <net/wext.h>
#include "net-sysfs.h"
@@ -147,7 +149,7 @@ static ssize_t show_speed(struct device *dev,
if (netif_running(netdev)) {
struct ethtool_cmd cmd;
- if (!dev_ethtool_get_settings(netdev, &cmd))
+ if (!__ethtool_get_settings(netdev, &cmd))
ret = sprintf(buf, fmt_udec, ethtool_cmd_speed(&cmd));
}
rtnl_unlock();
@@ -165,7 +167,7 @@ static ssize_t show_duplex(struct device *dev,
if (netif_running(netdev)) {
struct ethtool_cmd cmd;
- if (!dev_ethtool_get_settings(netdev, &cmd))
+ if (!__ethtool_get_settings(netdev, &cmd))
ret = sprintf(buf, "%s\n",
cmd.duplex ? "full" : "half");
}
@@ -605,9 +607,12 @@ static ssize_t store_rps_map(struct netdev_rx_queue *queue,
rcu_assign_pointer(queue->rps_map, map);
spin_unlock(&rps_map_lock);
- if (old_map)
+ if (map)
+ jump_label_inc(&rps_needed);
+ if (old_map) {
kfree_rcu(old_map, rcu);
-
+ jump_label_dec(&rps_needed);
+ }
free_cpumask_var(mask);
return len;
}
@@ -617,15 +622,15 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
char *buf)
{
struct rps_dev_flow_table *flow_table;
- unsigned int val = 0;
+ unsigned long val = 0;
rcu_read_lock();
flow_table = rcu_dereference(queue->rps_flow_table);
if (flow_table)
- val = flow_table->mask + 1;
+ val = (unsigned long)flow_table->mask + 1;
rcu_read_unlock();
- return sprintf(buf, "%u\n", val);
+ return sprintf(buf, "%lu\n", val);
}
static void rps_dev_flow_table_release_work(struct work_struct *work)
@@ -649,33 +654,46 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attr,
const char *buf, size_t len)
{
- unsigned int count;
- char *endp;
+ unsigned long mask, count;
struct rps_dev_flow_table *table, *old_table;
static DEFINE_SPINLOCK(rps_dev_flow_lock);
+ int rc;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- count = simple_strtoul(buf, &endp, 0);
- if (endp == buf)
- return -EINVAL;
+ rc = kstrtoul(buf, 0, &count);
+ if (rc < 0)
+ return rc;
if (count) {
- int i;
-
- if (count > 1<<30) {
+ mask = count - 1;
+ /* mask = roundup_pow_of_two(count) - 1;
+ * without overflows...
+ */
+ while ((mask | (mask >> 1)) != mask)
+ mask |= (mask >> 1);
+ /* On 64 bit arches, must check mask fits in table->mask (u32),
+ * and on 32bit arches, must check RPS_DEV_FLOW_TABLE_SIZE(mask + 1)
+ * doesnt overflow.
+ */
+#if BITS_PER_LONG > 32
+ if (mask > (unsigned long)(u32)mask)
+ return -EINVAL;
+#else
+ if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
+ / sizeof(struct rps_dev_flow)) {
/* Enforce a limit to prevent overflow */
return -EINVAL;
}
- count = roundup_pow_of_two(count);
- table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(count));
+#endif
+ table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
if (!table)
return -ENOMEM;
- table->mask = count - 1;
- for (i = 0; i < count; i++)
- table->flows[i].cpu = RPS_NO_CPU;
+ table->mask = mask;
+ for (count = 0; count <= mask; count++)
+ table->flows[count].cpu = RPS_NO_CPU;
} else
table = NULL;
@@ -712,13 +730,13 @@ static void rx_queue_release(struct kobject *kobj)
struct rps_dev_flow_table *flow_table;
- map = rcu_dereference_raw(queue->rps_map);
+ map = rcu_dereference_protected(queue->rps_map, 1);
if (map) {
RCU_INIT_POINTER(queue->rps_map, NULL);
kfree_rcu(map, rcu);
}
- flow_table = rcu_dereference_raw(queue->rps_flow_table);
+ flow_table = rcu_dereference_protected(queue->rps_flow_table, 1);
if (flow_table) {
RCU_INIT_POINTER(queue->rps_flow_table, NULL);
call_rcu(&flow_table->rcu, rps_dev_flow_table_release);
@@ -779,7 +797,7 @@ net_rx_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
#endif
}
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
/*
* netdev_queue sysfs structures and functions.
*/
@@ -825,6 +843,133 @@ static const struct sysfs_ops netdev_queue_sysfs_ops = {
.store = netdev_queue_attr_store,
};
+static ssize_t show_trans_timeout(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ char *buf)
+{
+ unsigned long trans_timeout;
+
+ spin_lock_irq(&queue->_xmit_lock);
+ trans_timeout = queue->trans_timeout;
+ spin_unlock_irq(&queue->_xmit_lock);
+
+ return sprintf(buf, "%lu", trans_timeout);
+}
+
+static struct netdev_queue_attribute queue_trans_timeout =
+ __ATTR(tx_timeout, S_IRUGO, show_trans_timeout, NULL);
+
+#ifdef CONFIG_BQL
+/*
+ * Byte queue limits sysfs structures and functions.
+ */
+static ssize_t bql_show(char *buf, unsigned int value)
+{
+ return sprintf(buf, "%u\n", value);
+}
+
+static ssize_t bql_set(const char *buf, const size_t count,
+ unsigned int *pvalue)
+{
+ unsigned int value;
+ int err;
+
+ if (!strcmp(buf, "max") || !strcmp(buf, "max\n"))
+ value = DQL_MAX_LIMIT;
+ else {
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+ if (value > DQL_MAX_LIMIT)
+ return -EINVAL;
+ }
+
+ *pvalue = value;
+
+ return count;
+}
+
+static ssize_t bql_show_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", jiffies_to_msecs(dql->slack_hold_time));
+}
+
+static ssize_t bql_set_hold_time(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attribute,
+ const char *buf, size_t len)
+{
+ struct dql *dql = &queue->dql;
+ unsigned value;
+ int err;
+
+ err = kstrtouint(buf, 10, &value);
+ if (err < 0)
+ return err;
+
+ dql->slack_hold_time = msecs_to_jiffies(value);
+
+ return len;
+}
+
+static struct netdev_queue_attribute bql_hold_time_attribute =
+ __ATTR(hold_time, S_IRUGO | S_IWUSR, bql_show_hold_time,
+ bql_set_hold_time);
+
+static ssize_t bql_show_inflight(struct netdev_queue *queue,
+ struct netdev_queue_attribute *attr,
+ char *buf)
+{
+ struct dql *dql = &queue->dql;
+
+ return sprintf(buf, "%u\n", dql->num_queued - dql->num_completed);
+}
+
+static struct netdev_queue_attribute bql_inflight_attribute =
+ __ATTR(inflight, S_IRUGO | S_IWUSR, bql_show_inflight, NULL);
+
+#define BQL_ATTR(NAME, FIELD) \
+static ssize_t bql_show_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ char *buf) \
+{ \
+ return bql_show(buf, queue->dql.FIELD); \
+} \
+ \
+static ssize_t bql_set_ ## NAME(struct netdev_queue *queue, \
+ struct netdev_queue_attribute *attr, \
+ const char *buf, size_t len) \
+{ \
+ return bql_set(buf, len, &queue->dql.FIELD); \
+} \
+ \
+static struct netdev_queue_attribute bql_ ## NAME ## _attribute = \
+ __ATTR(NAME, S_IRUGO | S_IWUSR, bql_show_ ## NAME, \
+ bql_set_ ## NAME);
+
+BQL_ATTR(limit, limit)
+BQL_ATTR(limit_max, max_limit)
+BQL_ATTR(limit_min, min_limit)
+
+static struct attribute *dql_attrs[] = {
+ &bql_limit_attribute.attr,
+ &bql_limit_max_attribute.attr,
+ &bql_limit_min_attribute.attr,
+ &bql_hold_time_attribute.attr,
+ &bql_inflight_attribute.attr,
+ NULL
+};
+
+static struct attribute_group dql_group = {
+ .name = "byte_queue_limits",
+ .attrs = dql_attrs,
+};
+#endif /* CONFIG_BQL */
+
+#ifdef CONFIG_XPS
static inline unsigned int get_netdev_queue_index(struct netdev_queue *queue)
{
struct net_device *dev = queue->dev;
@@ -889,6 +1034,52 @@ static DEFINE_MUTEX(xps_map_mutex);
#define xmap_dereference(P) \
rcu_dereference_protected((P), lockdep_is_held(&xps_map_mutex))
+/*
+ * Remove @queue's index from every per-CPU XPS map of its device.
+ *
+ * Runs under xps_map_mutex.  A per-CPU map that held only this queue is
+ * unpublished and freed with kfree_rcu(); if no per-CPU map remains after
+ * the walk, the device-wide xps_maps is unpublished and freed the same way.
+ */
+static void xps_queue_release(struct netdev_queue *queue)
+{
+ struct net_device *dev = queue->dev;
+ struct xps_dev_maps *dev_maps;
+ struct xps_map *map;
+ unsigned long index;
+ int i, pos, nonempty = 0;
+
+ index = get_netdev_queue_index(queue);
+
+ mutex_lock(&xps_map_mutex);
+ dev_maps = xmap_dereference(dev->xps_maps);
+
+ if (dev_maps) {
+ for_each_possible_cpu(i) {
+ map = xmap_dereference(dev_maps->cpu_map[i]);
+ if (!map)
+ continue;
+
+ /* Look for this queue's index in the CPU's map. */
+ for (pos = 0; pos < map->len; pos++)
+ if (map->queues[pos] == index)
+ break;
+
+ if (pos < map->len) {
+ /* Found: overwrite it with the last entry and shrink, */
+ /* or drop the whole map if it was the only entry. */
+ if (map->len > 1)
+ map->queues[pos] =
+ map->queues[--map->len];
+ else {
+ RCU_INIT_POINTER(dev_maps->cpu_map[i],
+ NULL);
+ kfree_rcu(map, rcu);
+ map = NULL;
+ }
+ }
+ /* A surviving map means dev_maps must be kept. */
+ if (map)
+ nonempty = 1;
+ }
+
+ if (!nonempty) {
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
+ kfree_rcu(dev_maps, rcu);
+ }
+ }
+ mutex_unlock(&xps_map_mutex);
+}
+
static ssize_t store_xps_map(struct netdev_queue *queue,
struct netdev_queue_attribute *attribute,
const char *buf, size_t len)
@@ -900,7 +1091,7 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
struct xps_map *map, *new_map;
struct xps_dev_maps *dev_maps, *new_dev_maps;
int nonempty = 0;
- int numa_node = -2;
+ int numa_node_id = -2;
if (!capable(CAP_NET_ADMIN))
return -EPERM;
@@ -943,10 +1134,10 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
need_set = cpumask_test_cpu(cpu, mask) && cpu_online(cpu);
#ifdef CONFIG_NUMA
if (need_set) {
- if (numa_node == -2)
- numa_node = cpu_to_node(cpu);
- else if (numa_node != cpu_to_node(cpu))
- numa_node = -1;
+ if (numa_node_id == -2)
+ numa_node_id = cpu_to_node(cpu);
+ else if (numa_node_id != cpu_to_node(cpu))
+ numa_node_id = -1;
}
#endif
if (need_set && pos >= map_len) {
@@ -986,17 +1177,17 @@ static ssize_t store_xps_map(struct netdev_queue *queue,
nonempty = 1;
}
- if (nonempty)
+ if (nonempty) {
rcu_assign_pointer(dev->xps_maps, new_dev_maps);
- else {
+ } else {
kfree(new_dev_maps);
- rcu_assign_pointer(dev->xps_maps, NULL);
+ RCU_INIT_POINTER(dev->xps_maps, NULL);
}
if (dev_maps)
kfree_rcu(dev_maps, rcu);
- netdev_queue_numa_node_write(queue, (numa_node >= 0) ? numa_node :
+ netdev_queue_numa_node_write(queue, (numa_node_id >= 0) ? numa_node_id :
NUMA_NO_NODE);
mutex_unlock(&xps_map_mutex);
@@ -1019,58 +1210,23 @@ error:
static struct netdev_queue_attribute xps_cpus_attribute =
__ATTR(xps_cpus, S_IRUGO | S_IWUSR, show_xps_map, store_xps_map);
+#endif /* CONFIG_XPS */
static struct attribute *netdev_queue_default_attrs[] = {
+ &queue_trans_timeout.attr,
+#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
+#endif
NULL
};
static void netdev_queue_release(struct kobject *kobj)
{
struct netdev_queue *queue = to_netdev_queue(kobj);
- struct net_device *dev = queue->dev;
- struct xps_dev_maps *dev_maps;
- struct xps_map *map;
- unsigned long index;
- int i, pos, nonempty = 0;
-
- index = get_netdev_queue_index(queue);
-
- mutex_lock(&xps_map_mutex);
- dev_maps = xmap_dereference(dev->xps_maps);
-
- if (dev_maps) {
- for_each_possible_cpu(i) {
- map = xmap_dereference(dev_maps->cpu_map[i]);
- if (!map)
- continue;
- for (pos = 0; pos < map->len; pos++)
- if (map->queues[pos] == index)
- break;
-
- if (pos < map->len) {
- if (map->len > 1)
- map->queues[pos] =
- map->queues[--map->len];
- else {
- RCU_INIT_POINTER(dev_maps->cpu_map[i],
- NULL);
- kfree_rcu(map, rcu);
- map = NULL;
- }
- }
- if (map)
- nonempty = 1;
- }
-
- if (!nonempty) {
- RCU_INIT_POINTER(dev->xps_maps, NULL);
- kfree_rcu(dev_maps, rcu);
- }
- }
-
- mutex_unlock(&xps_map_mutex);
+#ifdef CONFIG_XPS
+ xps_queue_release(queue);
+#endif
memset(kobj, 0, sizeof(*kobj));
dev_put(queue->dev);
@@ -1091,22 +1247,29 @@ static int netdev_queue_add_kobject(struct net_device *net, int index)
kobj->kset = net->queues_kset;
error = kobject_init_and_add(kobj, &netdev_queue_ktype, NULL,
"tx-%u", index);
- if (error) {
- kobject_put(kobj);
- return error;
- }
+ if (error)
+ goto exit;
+
+#ifdef CONFIG_BQL
+ error = sysfs_create_group(kobj, &dql_group);
+ if (error)
+ goto exit;
+#endif
kobject_uevent(kobj, KOBJ_ADD);
dev_hold(queue->dev);
+ return 0;
+exit:
+ kobject_put(kobj);
return error;
}
-#endif /* CONFIG_XPS */
+#endif /* CONFIG_SYSFS */
int
netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
{
-#ifdef CONFIG_XPS
+#ifdef CONFIG_SYSFS
int i;
int error = 0;
@@ -1118,20 +1281,26 @@ netdev_queue_update_kobjects(struct net_device *net, int old_num, int new_num)
}
}
- while (--i >= new_num)
- kobject_put(&net->_tx[i].kobj);
+ while (--i >= new_num) {
+ struct netdev_queue *queue = net->_tx + i;
+
+#ifdef CONFIG_BQL
+ sysfs_remove_group(&queue->kobj, &dql_group);
+#endif
+ kobject_put(&queue->kobj);
+ }
return error;
#else
return 0;
-#endif
+#endif /* CONFIG_SYSFS */
}
static int register_queue_kobjects(struct net_device *net)
{
int error = 0, txq = 0, rxq = 0, real_rx = 0, real_tx = 0;
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
net->queues_kset = kset_create_and_add("queues",
NULL, &net->dev.kobj);
if (!net->queues_kset)
@@ -1172,7 +1341,7 @@ static void remove_queue_kobjects(struct net_device *net)
net_rx_queue_update_kobjects(net, real_rx, 0);
netdev_queue_update_kobjects(net, real_tx, 0);
-#if defined(CONFIG_RPS) || defined(CONFIG_XPS)
+#ifdef CONFIG_SYSFS
kset_unregister(net->queues_kset);
#endif
}
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index 52380b1d552..ba3c0120786 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -11,6 +11,7 @@
#include <linux/inetdevice.h>
#include <linux/inet.h>
#include <linux/interrupt.h>
+#include <linux/export.h>
#include <linux/netpoll.h>
#include <linux/sched.h>
#include <linux/delay.h>
diff --git a/net/core/net_namespace.c b/net/core/net_namespace.c
index 5bbdbf0d366..aefcd7acbff 100644
--- a/net/core/net_namespace.c
+++ b/net/core/net_namespace.c
@@ -10,6 +10,7 @@
#include <linux/nsproxy.h>
#include <linux/proc_fs.h>
#include <linux/file.h>
+#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
diff --git a/net/core/netevent.c b/net/core/netevent.c
index 865f0ceb81f..f17ccd291d3 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -15,6 +15,7 @@
#include <linux/rtnetlink.h>
#include <linux/notifier.h>
+#include <linux/export.h>
#include <net/netevent.h>
static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
diff --git a/net/core/netpoll.c b/net/core/netpoll.c
index 52622517e0d..556b0829866 100644
--- a/net/core/netpoll.c
+++ b/net/core/netpoll.c
@@ -23,6 +23,7 @@
#include <linux/rcupdate.h>
#include <linux/workqueue.h>
#include <linux/slab.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <net/udp.h>
#include <asm/unaligned.h>
@@ -75,7 +76,7 @@ static void queue_process(struct work_struct *work)
local_irq_save(flags);
__netif_tx_lock(txq, smp_processor_id());
- if (netif_tx_queue_frozen_or_stopped(txq) ||
+ if (netif_xmit_frozen_or_stopped(txq) ||
ops->ndo_start_xmit(skb, dev) != NETDEV_TX_OK) {
skb_queue_head(&npinfo->txq, skb);
__netif_tx_unlock(txq);
@@ -316,7 +317,7 @@ void netpoll_send_skb_on_dev(struct netpoll *np, struct sk_buff *skb,
for (tries = jiffies_to_usecs(1)/USEC_PER_POLL;
tries > 0; --tries) {
if (__netif_tx_trylock(txq)) {
- if (!netif_tx_queue_stopped(txq)) {
+ if (!netif_xmit_stopped(txq)) {
status = ops->ndo_start_xmit(skb, dev);
if (status == NETDEV_TX_OK)
txq_trans_update(txq);
@@ -421,6 +422,7 @@ static void arp_reply(struct sk_buff *skb)
struct sk_buff *send_skb;
struct netpoll *np, *tmp;
unsigned long flags;
+ int hlen, tlen;
int hits = 0;
if (list_empty(&npinfo->rx_np))
@@ -478,8 +480,9 @@ static void arp_reply(struct sk_buff *skb)
if (tip != np->local_ip)
continue;
- send_skb = find_skb(np, size + LL_ALLOCATED_SPACE(np->dev),
- LL_RESERVED_SPACE(np->dev));
+ hlen = LL_RESERVED_SPACE(np->dev);
+ tlen = np->dev->needed_tailroom;
+ send_skb = find_skb(np, size + hlen + tlen, hlen);
if (!send_skb)
continue;
@@ -903,7 +906,7 @@ void __netpoll_cleanup(struct netpoll *np)
if (ops->ndo_netpoll_cleanup)
ops->ndo_netpoll_cleanup(np->dev);
- rcu_assign_pointer(np->dev->npinfo, NULL);
+ RCU_INIT_POINTER(np->dev->npinfo, NULL);
/* avoid racing with NAPI reading npinfo */
synchronize_rcu_bh();
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
new file mode 100644
index 00000000000..3a9fd4826b7
--- /dev/null
+++ b/net/core/netprio_cgroup.c
@@ -0,0 +1,344 @@
+/*
+ * net/core/netprio_cgroup.c Priority Control Group
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * Authors: Neil Horman <nhorman@tuxdriver.com>
+ */
+
+#include <linux/module.h>
+#include <linux/slab.h>
+#include <linux/types.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/cgroup.h>
+#include <linux/rcupdate.h>
+#include <linux/atomic.h>
+#include <net/rtnetlink.h>
+#include <net/pkt_cls.h>
+#include <net/sock.h>
+#include <net/netprio_cgroup.h>
+
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+ struct cgroup *cgrp);
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp);
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp);
+
+/*
+ * cgroup subsystem descriptor for "net_prio".  subsys_id is initialised
+ * here only when CONFIG_NETPRIO_CGROUP is defined; otherwise
+ * init_cgroup_netprio() copies net_prio_subsys.subsys_id into
+ * net_prio_subsys_id once cgroup_load_subsys() has returned success.
+ */
+struct cgroup_subsys net_prio_subsys = {
+ .name = "net_prio",
+ .create = cgrp_create,
+ .destroy = cgrp_destroy,
+ .populate = cgrp_populate,
+#ifdef CONFIG_NETPRIO_CGROUP
+ .subsys_id = net_prio_subsys_id,
+#endif
+ .module = THIS_MODULE
+};
+
+/* Number of unsigned longs backing the priority-index bitmap. */
+#define PRIOIDX_SZ 128
+
+/* Bitmap of priority indexes in use; modified under prioidx_map_lock. */
+static unsigned long prioidx_map[PRIOIDX_SZ];
+static DEFINE_SPINLOCK(prioidx_map_lock);
+/* Index recorded by get_prioidx(); read to size the per-device priomaps. */
+static atomic_t max_prioidx = ATOMIC_INIT(0);
+
+/* Return the netprio state that embeds @cgrp's net_prio subsystem state. */
+static inline struct cgroup_netprio_state *cgrp_netprio_state(struct cgroup *cgrp)
+{
+	struct cgroup_subsys_state *subsys_state;
+
+	subsys_state = cgroup_subsys_state(cgrp, net_prio_subsys_id);
+	return container_of(subsys_state, struct cgroup_netprio_state, css);
+}
+
+/*
+ * Allocate the lowest free priority index.
+ *
+ * On success the index is stored in *prio and 0 is returned.  Returns
+ * -ENOSPC, leaving the bitmap and *prio untouched, when every index in
+ * the searched range is already in use.
+ */
+static int get_prioidx(u32 *prio)
+{
+	const unsigned long nbits = sizeof(unsigned long) * PRIOIDX_SZ;
+	unsigned long flags;
+	u32 prioidx;
+
+	spin_lock_irqsave(&prioidx_map_lock, flags);
+	prioidx = find_first_zero_bit(prioidx_map, nbits);
+	if (prioidx == nbits) {
+		/* Map is full: never mark a bit outside the searched range. */
+		spin_unlock_irqrestore(&prioidx_map_lock, flags);
+		return -ENOSPC;
+	}
+	set_bit(prioidx, prioidx_map);
+	/*
+	 * max_prioidx is used to size the per-device tables, so it may only
+	 * grow: handing out a recycled low index must not shrink it.
+	 */
+	if (atomic_read(&max_prioidx) < prioidx)
+		atomic_set(&max_prioidx, prioidx);
+	spin_unlock_irqrestore(&prioidx_map_lock, flags);
+
+	*prio = prioidx;
+	return 0;
+}
+
+/* Mark priority index @idx as free again in prioidx_map. */
+static void put_prioidx(u32 idx)
+{
+	unsigned long irqflags;
+
+	spin_lock_irqsave(&prioidx_map_lock, irqflags);
+	clear_bit(idx, prioidx_map);
+	spin_unlock_irqrestore(&prioidx_map_lock, irqflags);
+}
+
+/*
+ * Replace @dev's priomap with a zero-filled one holding @new_len entries,
+ * carrying over the entries of the current map (if any).  The old map is
+ * released with kfree_rcu().  On allocation failure a warning is logged
+ * and the device keeps its current map.
+ */
+static void extend_netdev_table(struct net_device *dev, u32 new_len)
+{
+	struct netprio_map *cur = rtnl_dereference(dev->priomap);
+	struct netprio_map *grown;
+	size_t bytes = sizeof(*grown) + sizeof(u32) * new_len;
+	int n;
+
+	grown = kzalloc(bytes, GFP_KERNEL);
+	if (!grown) {
+		printk(KERN_WARNING "Unable to alloc new priomap!\n");
+		return;
+	}
+
+	if (cur) {
+		for (n = 0; n < cur->priomap_len; n++)
+			grown->priomap[n] = cur->priomap[n];
+	}
+	grown->priomap_len = new_len;
+
+	/* Publish the new table before scheduling the old one for freeing. */
+	rcu_assign_pointer(dev->priomap, grown);
+	if (cur)
+		kfree_rcu(cur, rcu);
+}
+
+/*
+ * Under the RTNL lock, make sure every device in init_net has a priomap
+ * large enough to be indexed by any priority index up to and including
+ * max_prioidx.
+ */
+static void update_netdev_tables(void)
+{
+	struct net_device *dev;
+	/* Indexes are zero-based: holding index N requires N + 1 slots. */
+	u32 max_len = atomic_read(&max_prioidx) + 1;
+	struct netprio_map *map;
+
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		map = rtnl_dereference(dev->priomap);
+		if (!map || map->priomap_len < max_len)
+			extend_netdev_table(dev, max_len);
+	}
+	rtnl_unlock();
+}
+
+/*
+ * Allocate the netprio state for a new cgroup and give it a priority index.
+ *
+ * Returns the embedded css on success, or an ERR_PTR: -ENOMEM when the
+ * state cannot be allocated, -EINVAL when the parent cgroup has a non-zero
+ * prioidx, or the error returned by get_prioidx().
+ */
+static struct cgroup_subsys_state *cgrp_create(struct cgroup_subsys *ss,
+						struct cgroup *cgrp)
+{
+	struct cgroup_netprio_state *state;
+	int err;
+
+	state = kzalloc(sizeof(*state), GFP_KERNEL);
+	if (!state)
+		return ERR_PTR(-ENOMEM);
+
+	err = -EINVAL;
+	if (cgrp->parent && cgrp_netprio_state(cgrp->parent)->prioidx)
+		goto free_state;
+
+	err = get_prioidx(&state->prioidx);
+	if (err != 0) {
+		printk(KERN_WARNING "No space in priority index array\n");
+		goto free_state;
+	}
+
+	return &state->css;
+
+free_state:
+	kfree(state);
+	return ERR_PTR(err);
+}
+
+/*
+ * Tear down a cgroup's netprio state: clear its slot in every device
+ * priomap of init_net, release its priority index and free the state.
+ */
+static void cgrp_destroy(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	struct cgroup_netprio_state *cs;
+	struct net_device *dev;
+	struct netprio_map *map;
+
+	cs = cgrp_netprio_state(cgrp);
+	rtnl_lock();
+	for_each_netdev(&init_net, dev) {
+		map = rtnl_dereference(dev->priomap);
+		/*
+		 * Tables are only grown when a priority is written, so a
+		 * device's table may be shorter than this cgroup's index.
+		 */
+		if (map && cs->prioidx < map->priomap_len)
+			map->priomap[cs->prioidx] = 0;
+	}
+	rtnl_unlock();
+	put_prioidx(cs->prioidx);
+	kfree(cs);
+}
+
+/* read_u64 handler for the "prioidx" file: report the cgroup's index. */
+static u64 read_prioidx(struct cgroup *cgrp, struct cftype *cft)
+{
+	struct cgroup_netprio_state *state = cgrp_netprio_state(cgrp);
+
+	return (u64)state->prioidx;
+}
+
+/*
+ * read_map handler for the "ifpriomap" file: for each device in init_net,
+ * report the priority stored for this cgroup's index.  Devices without a
+ * priomap, or whose priomap does not reach this index, report 0.
+ */
+static int read_priomap(struct cgroup *cont, struct cftype *cft,
+			struct cgroup_map_cb *cb)
+{
+	struct net_device *dev;
+	u32 prioidx = cgrp_netprio_state(cont)->prioidx;
+	u32 priority;
+	struct netprio_map *map;
+
+	rcu_read_lock();
+	for_each_netdev_rcu(&init_net, dev) {
+		map = rcu_dereference(dev->priomap);
+		/* Never index past the end of a table that was not grown. */
+		priority = (map && prioidx < map->priomap_len) ?
+			   map->priomap[prioidx] : 0;
+		cb->fill(cb, dev->name, priority);
+	}
+	rcu_read_unlock();
+	return 0;
+}
+
+/*
+ * write_string handler for the "ifpriomap" file.  @buffer must have the
+ * form "<devname> <priority>" with a decimal priority.
+ *
+ * Returns 0 on success, -ENOMEM if the input cannot be duplicated or the
+ * device has no priomap slot for this cgroup, -EINVAL for malformed
+ * input, the kstrtoul() error for a bad number, or -ENODEV when the
+ * device is not found in init_net.
+ */
+static int write_priomap(struct cgroup *cgrp, struct cftype *cft,
+			 const char *buffer)
+{
+	char *devname = kstrdup(buffer, GFP_KERNEL);
+	int ret = -EINVAL;
+	u32 prioidx = cgrp_netprio_state(cgrp)->prioidx;
+	unsigned long priority;
+	char *priostr;
+	struct net_device *dev;
+	struct netprio_map *map;
+
+	if (!devname)
+		return -ENOMEM;
+
+	/*
+	 * Minimally sized valid priomap string
+	 */
+	if (strlen(devname) < 3)
+		goto out_free_devname;
+
+	priostr = strstr(devname, " ");
+	if (!priostr)
+		goto out_free_devname;
+
+	/*
+	 * Separate the devname from the associated priority
+	 * and advance the priostr pointer to the priority value
+	 */
+	*priostr = '\0';
+	priostr++;
+
+	/*
+	 * If priostr points at the terminating NUL, we're at the end of the
+	 * passed in string, and it's not a valid write
+	 */
+	if (*priostr == '\0')
+		goto out_free_devname;
+
+	ret = kstrtoul(priostr, 10, &priority);
+	if (ret < 0)
+		goto out_free_devname;
+
+	ret = -ENODEV;
+
+	dev = dev_get_by_name(&init_net, devname);
+	if (!dev)
+		goto out_free_devname;
+
+	update_netdev_tables();
+
+	/*
+	 * Growing the table can fail, leaving it missing or too short.
+	 * Report that instead of writing past the end or claiming success.
+	 */
+	ret = -ENOMEM;
+	rcu_read_lock();
+	map = rcu_dereference(dev->priomap);
+	if (map && prioidx < map->priomap_len) {
+		map->priomap[prioidx] = priority;
+		ret = 0;
+	}
+	rcu_read_unlock();
+	dev_put(dev);
+
+out_free_devname:
+	kfree(devname);
+	return ret;
+}
+
+/* Control files that cgrp_populate() adds to a cgroup. */
+static struct cftype ss_files[] = {
+ {
+ /* Read-only: the cgroup's priority index. */
+ .name = "prioidx",
+ .read_u64 = read_prioidx,
+ },
+ {
+ /* Per-device priorities: read as a map, written as a string. */
+ .name = "ifpriomap",
+ .read_map = read_priomap,
+ .write_string = write_priomap,
+ },
+};
+
+/* populate callback: add every entry of ss_files to @cgrp. */
+static int cgrp_populate(struct cgroup_subsys *ss, struct cgroup *cgrp)
+{
+	int nfiles = ARRAY_SIZE(ss_files);
+
+	return cgroup_add_files(cgrp, ss, ss_files, nfiles);
+}
+
+/*
+ * Netdevice notifier: give a newly registered device a priomap sized for
+ * every priority index up to and including max_prioidx, and release the
+ * priomap of a device that is being unregistered.  Always returns
+ * NOTIFY_DONE.
+ */
+static int netprio_device_event(struct notifier_block *unused,
+				unsigned long event, void *ptr)
+{
+	struct net_device *dev = ptr;
+	struct netprio_map *old;
+	/* Indexes are zero-based: holding index N requires N + 1 slots. */
+	u32 max_len = atomic_read(&max_prioidx) + 1;
+
+	/*
+	 * Note this is called with rtnl_lock held so we have update side
+	 * protection on our rcu assignments
+	 */
+
+	switch (event) {
+	case NETDEV_REGISTER:
+		extend_netdev_table(dev, max_len);
+		break;
+	case NETDEV_UNREGISTER:
+		old = rtnl_dereference(dev->priomap);
+		RCU_INIT_POINTER(dev->priomap, NULL);
+		if (old)
+			kfree_rcu(old, rcu);
+		break;
+	}
+	return NOTIFY_DONE;
+}
+
+static struct notifier_block netprio_device_notifier = {
+ .notifier_call = netprio_device_event
+};
+
+/*
+ * Module init: load the net_prio cgroup subsystem and, if that succeeds,
+ * register the netdevice notifier.  Returns the result of
+ * cgroup_load_subsys().
+ */
+static int __init init_cgroup_netprio(void)
+{
+	int err = cgroup_load_subsys(&net_prio_subsys);
+
+	if (err)
+		return err;
+
+#ifndef CONFIG_NETPRIO_CGROUP
+	smp_wmb();
+	net_prio_subsys_id = net_prio_subsys.subsys_id;
+#endif
+
+	register_netdevice_notifier(&netprio_device_notifier);
+	return 0;
+}
+
+/*
+ * Module exit: unregister the netdevice notifier, unload the cgroup
+ * subsystem, then detach and free the priomap of every device in init_net.
+ */
+static void __exit exit_cgroup_netprio(void)
+{
+ struct netprio_map *old;
+ struct net_device *dev;
+
+ unregister_netdevice_notifier(&netprio_device_notifier);
+
+ cgroup_unload_subsys(&net_prio_subsys);
+
+#ifndef CONFIG_NETPRIO_CGROUP
+ /* Invalidate the id, then wait for an RCU grace period. */
+ net_prio_subsys_id = -1;
+ synchronize_rcu();
+#endif
+
+ rtnl_lock();
+ for_each_netdev(&init_net, dev) {
+ old = rtnl_dereference(dev->priomap);
+ /* Unpublish first; the old table is freed via kfree_rcu(). */
+ RCU_INIT_POINTER(dev->priomap, NULL);
+ if (old)
+ kfree_rcu(old, rcu);
+ }
+ rtnl_unlock();
+}
+
+module_init(init_cgroup_netprio);
+module_exit(exit_cgroup_netprio);
+MODULE_LICENSE("GPL v2");
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index e35a6fbb811..65f80c7b165 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -1304,7 +1304,7 @@ static ssize_t pktgen_if_write(struct file *file,
scan_ip6(buf, pkt_dev->in6_daddr.s6_addr);
snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr, &pkt_dev->in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->in6_daddr;
if (debug)
printk(KERN_DEBUG "pktgen: dst6 set to: %s\n", buf);
@@ -1327,8 +1327,7 @@ static ssize_t pktgen_if_write(struct file *file,
scan_ip6(buf, pkt_dev->min_in6_daddr.s6_addr);
snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->min_in6_daddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_daddr,
- &pkt_dev->min_in6_daddr);
+ pkt_dev->cur_in6_daddr = pkt_dev->min_in6_daddr;
if (debug)
printk(KERN_DEBUG "pktgen: dst6_min set to: %s\n", buf);
@@ -1371,7 +1370,7 @@ static ssize_t pktgen_if_write(struct file *file,
scan_ip6(buf, pkt_dev->in6_saddr.s6_addr);
snprintf(buf, sizeof(buf), "%pI6c", &pkt_dev->in6_saddr);
- ipv6_addr_copy(&pkt_dev->cur_in6_saddr, &pkt_dev->in6_saddr);
+ pkt_dev->cur_in6_saddr = pkt_dev->in6_saddr;
if (debug)
printk(KERN_DEBUG "pktgen: src6 set to: %s\n", buf);
@@ -2025,13 +2024,13 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
pr_warning("WARNING: Requested queue_map_min (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
pkt_dev->queue_map_min, (ntxq ?: 1) - 1, ntxq,
pkt_dev->odevname);
- pkt_dev->queue_map_min = ntxq - 1;
+ pkt_dev->queue_map_min = (ntxq ?: 1) - 1;
}
if (pkt_dev->queue_map_max >= ntxq) {
pr_warning("WARNING: Requested queue_map_max (zero-based) (%d) exceeds valid range [0 - %d] for (%d) queues on %s, resetting\n",
pkt_dev->queue_map_max, (ntxq ?: 1) - 1, ntxq,
pkt_dev->odevname);
- pkt_dev->queue_map_max = ntxq - 1;
+ pkt_dev->queue_map_max = (ntxq ?: 1) - 1;
}
/* Default to the interface's mac if not explicitly set. */
@@ -2079,9 +2078,7 @@ static void pktgen_setup_inject(struct pktgen_dev *pkt_dev)
ifp = ifp->if_next) {
if (ifp->scope == IFA_LINK &&
!(ifp->flags & IFA_F_TENTATIVE)) {
- ipv6_addr_copy(&pkt_dev->
- cur_in6_saddr,
- &ifp->addr);
+ pkt_dev->cur_in6_saddr = ifp->addr;
err = 0;
break;
}
@@ -2145,9 +2142,12 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
}
start_time = ktime_now();
- if (remaining < 100000)
- ndelay(remaining); /* really small just spin */
- else {
+ if (remaining < 100000) {
+ /* for small delays (<100us), just loop until limit is reached */
+ do {
+ end_time = ktime_now();
+ } while (ktime_lt(end_time, spin_until));
+ } else {
/* see do_nanosleep */
hrtimer_init_sleeper(&t, current);
do {
@@ -2162,8 +2162,8 @@ static void spin(struct pktgen_dev *pkt_dev, ktime_t spin_until)
hrtimer_cancel(&t.timer);
} while (t.task && pkt_dev->running && !signal_pending(current));
__set_current_state(TASK_RUNNING);
+ end_time = ktime_now();
}
- end_time = ktime_now();
pkt_dev->idle_acc += ktime_to_ns(ktime_sub(end_time, start_time));
pkt_dev->next_tx = ktime_add_ns(spin_until, pkt_dev->delay);
@@ -2602,18 +2602,18 @@ static void pktgen_finalize_skb(struct pktgen_dev *pkt_dev, struct sk_buff *skb,
if (!pkt_dev->page)
break;
}
- skb_shinfo(skb)->frags[i].page = pkt_dev->page;
get_page(pkt_dev->page);
+ skb_frag_set_page(skb, i, pkt_dev->page);
skb_shinfo(skb)->frags[i].page_offset = 0;
/*last fragment, fill rest of data*/
if (i == (frags - 1))
- skb_shinfo(skb)->frags[i].size =
- (datalen < PAGE_SIZE ? datalen : PAGE_SIZE);
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i],
+ (datalen < PAGE_SIZE ? datalen : PAGE_SIZE));
else
- skb_shinfo(skb)->frags[i].size = frag_len;
- datalen -= skb_shinfo(skb)->frags[i].size;
- skb->len += skb_shinfo(skb)->frags[i].size;
- skb->data_len += skb_shinfo(skb)->frags[i].size;
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], frag_len);
+ datalen -= skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
+ skb->data_len += skb_frag_size(&skb_shinfo(skb)->frags[i]);
i++;
skb_shinfo(skb)->nr_frags = i;
}
@@ -2955,8 +2955,8 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
iph->payload_len = htons(sizeof(struct udphdr) + datalen);
iph->nexthdr = IPPROTO_UDP;
- ipv6_addr_copy(&iph->daddr, &pkt_dev->cur_in6_daddr);
- ipv6_addr_copy(&iph->saddr, &pkt_dev->cur_in6_saddr);
+ iph->daddr = pkt_dev->cur_in6_daddr;
+ iph->saddr = pkt_dev->cur_in6_saddr;
skb->mac_header = (skb->network_header - ETH_HLEN -
pkt_dev->pkt_overhead);
@@ -3342,7 +3342,7 @@ static void pktgen_xmit(struct pktgen_dev *pkt_dev)
__netif_tx_lock_bh(txq);
- if (unlikely(netif_tx_queue_frozen_or_stopped(txq))) {
+ if (unlikely(netif_xmit_frozen_or_stopped(txq))) {
ret = NETDEV_TX_BUSY;
pkt_dev->last_ok = 0;
goto unlock;
diff --git a/net/core/request_sock.c b/net/core/request_sock.c
index 182236b2510..9b570a6a33c 100644
--- a/net/core/request_sock.c
+++ b/net/core/request_sock.c
@@ -26,10 +26,11 @@
* but then some measure against one socket starving all other sockets
* would be needed.
*
- * It was 128 by default. Experiments with real servers show, that
+ * The minimum value of it is 128. Experiments with real servers show that
* it is absolutely not enough even at 100conn/sec. 256 cures most
- * of problems. This value is adjusted to 128 for very small machines
- * (<=32Mb of memory) and to 1024 on normal or better ones (>=256Mb).
+ * of problems.
+ * This value is adjusted to 128 for low memory machines,
+ * and it will increase in proportion to the memory of machine.
* Note : Dont forget somaxconn that may limit backlog too.
*/
int sysctl_max_syn_backlog = 256;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index d3a62819671..f16444bc6cb 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -273,6 +273,17 @@ EXPORT_SYMBOL_GPL(rtnl_unregister_all);
static LIST_HEAD(link_ops);
+/*
+ * Look up registered link ops by kind string.  Returns the first entry on
+ * link_ops whose kind equals @kind, or NULL when none matches.
+ */
+static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
+{
+	const struct rtnl_link_ops *entry;
+
+	list_for_each_entry(entry, &link_ops, list)
+		if (strcmp(entry->kind, kind) == 0)
+			return entry;
+
+	return NULL;
+}
+
/**
* __rtnl_link_register - Register rtnl_link_ops with rtnetlink.
* @ops: struct rtnl_link_ops * to register
@@ -285,6 +296,9 @@ static LIST_HEAD(link_ops);
*/
int __rtnl_link_register(struct rtnl_link_ops *ops)
{
+ if (rtnl_link_ops_get(ops->kind))
+ return -EEXIST;
+
if (!ops->dellink)
ops->dellink = unregister_netdevice_queue;
@@ -351,17 +365,6 @@ void rtnl_link_unregister(struct rtnl_link_ops *ops)
}
EXPORT_SYMBOL_GPL(rtnl_link_unregister);
-static const struct rtnl_link_ops *rtnl_link_ops_get(const char *kind)
-{
- const struct rtnl_link_ops *ops;
-
- list_for_each_entry(ops, &link_ops, list) {
- if (!strcmp(ops->kind, kind))
- return ops;
- }
- return NULL;
-}
-
static size_t rtnl_link_get_size(const struct net_device *dev)
{
const struct rtnl_link_ops *ops = dev->rtnl_link_ops;
@@ -731,7 +734,8 @@ static inline int rtnl_vfinfo_size(const struct net_device *dev)
size += num_vfs *
(nla_total_size(sizeof(struct ifla_vf_mac)) +
nla_total_size(sizeof(struct ifla_vf_vlan)) +
- nla_total_size(sizeof(struct ifla_vf_tx_rate)));
+ nla_total_size(sizeof(struct ifla_vf_tx_rate)) +
+ nla_total_size(sizeof(struct ifla_vf_spoofchk)));
return size;
} else
return 0;
@@ -954,13 +958,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
struct ifla_vf_mac vf_mac;
struct ifla_vf_vlan vf_vlan;
struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_spoofchk vf_spoofchk;
+
+ /*
+ * Not all SR-IOV capable drivers support the
+ * spoofcheck query. Preset to -1 so the user
+ * space tool can detect that the driver didn't
+ * report anything.
+ */
+ ivi.spoofchk = -1;
if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
break;
- vf_mac.vf = vf_vlan.vf = vf_tx_rate.vf = ivi.vf;
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf = ivi.vf;
+
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
vf_vlan.vlan = ivi.vlan;
vf_vlan.qos = ivi.qos;
vf_tx_rate.rate = ivi.tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
vf = nla_nest_start(skb, IFLA_VF_INFO);
if (!vf) {
nla_nest_cancel(skb, vfinfo);
@@ -968,7 +986,10 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
}
NLA_PUT(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac);
NLA_PUT(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan);
- NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate), &vf_tx_rate);
+ NLA_PUT(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate);
+ NLA_PUT(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk);
nla_nest_end(skb, vf);
}
nla_nest_end(skb, vfinfo);
@@ -1202,6 +1223,15 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr *attr)
ivt->rate);
break;
}
+ case IFLA_VF_SPOOFCHK: {
+ struct ifla_vf_spoofchk *ivs;
+ ivs = nla_data(vf);
+ err = -EOPNOTSUPP;
+ if (ops->ndo_set_vf_spoofchk)
+ err = ops->ndo_set_vf_spoofchk(dev, ivs->vf,
+ ivs->setting);
+ break;
+ }
default:
err = -EINVAL;
break;
@@ -1604,7 +1634,6 @@ struct net_device *rtnl_create_link(struct net *src_net, struct net *net,
dev_net_set(dev, net);
dev->rtnl_link_ops = ops;
dev->rtnl_link_state = RTNL_LINK_INITIALIZING;
- dev->real_num_tx_queues = real_num_queues;
if (tb[IFLA_MTU])
dev->mtu = nla_get_u32(tb[IFLA_MTU]);
diff --git a/net/core/scm.c b/net/core/scm.c
index 811b53fb330..ff52ad0a515 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -173,7 +173,7 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
if (err)
goto error;
- if (pid_vnr(p->pid) != p->creds.pid) {
+ if (!p->pid || pid_vnr(p->pid) != p->creds.pid) {
struct pid *pid;
err = -ESRCH;
pid = find_get_pid(p->creds.pid);
@@ -183,8 +183,9 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
p->pid = pid;
}
- if ((p->cred->euid != p->creds.uid) ||
- (p->cred->egid != p->creds.gid)) {
+ if (!p->cred ||
+ (p->cred->euid != p->creds.uid) ||
+ (p->cred->egid != p->creds.gid)) {
struct cred *cred;
err = -ENOMEM;
cred = prepare_creds();
@@ -193,7 +194,8 @@ int __scm_send(struct socket *sock, struct msghdr *msg, struct scm_cookie *p)
cred->uid = cred->euid = p->creds.uid;
cred->gid = cred->egid = p->creds.gid;
- put_cred(p->cred);
+ if (p->cred)
+ put_cred(p->cred);
p->cred = cred;
}
break;
diff --git a/net/core/secure_seq.c b/net/core/secure_seq.c
index 45329d7c9dd..6fd44606fdd 100644
--- a/net/core/secure_seq.c
+++ b/net/core/secure_seq.c
@@ -19,6 +19,7 @@ static int __init net_secret_init(void)
}
late_initcall(net_secret_init);
+#ifdef CONFIG_INET
static u32 seq_scale(u32 seq)
{
/*
@@ -33,9 +34,10 @@ static u32 seq_scale(u32 seq)
*/
return seq + (ktime_to_ns(ktime_get_real()) >> 6);
}
+#endif
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
-__u32 secure_tcpv6_sequence_number(__be32 *saddr, __be32 *daddr,
+#if IS_ENABLED(CONFIG_IPV6)
+__u32 secure_tcpv6_sequence_number(const __be32 *saddr, const __be32 *daddr,
__be16 sport, __be16 dport)
{
u32 secret[MD5_MESSAGE_BYTES / 4];
@@ -132,7 +134,7 @@ u32 secure_ipv4_port_ephemeral(__be32 saddr, __be32 daddr, __be16 dport)
EXPORT_SYMBOL_GPL(secure_ipv4_port_ephemeral);
#endif
-#if defined(CONFIG_IP_DCCP) || defined(CONFIG_IP_DCCP_MODULE)
+#if IS_ENABLED(CONFIG_IP_DCCP)
u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
__be16 sport, __be16 dport)
{
@@ -154,7 +156,7 @@ u64 secure_dccp_sequence_number(__be32 saddr, __be32 daddr,
}
EXPORT_SYMBOL(secure_dccp_sequence_number);
-#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
+#if IS_ENABLED(CONFIG_IPV6)
u64 secure_dccpv6_sequence_number(__be32 *saddr, __be32 *daddr,
__be16 sport, __be16 dport)
{
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index 387703f56fc..da0c97f2fab 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -184,11 +184,21 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
goto out;
prefetchw(skb);
+ /* We do our best to align skb_shared_info on a separate cache
+ * line. It usually works because kmalloc(X > SMP_CACHE_BYTES) gives
+ * aligned memory blocks, unless SLUB/SLAB debug is enabled.
+ * Both skb->head and skb_shared_info are cache line aligned.
+ */
size = SKB_DATA_ALIGN(size);
- data = kmalloc_node_track_caller(size + sizeof(struct skb_shared_info),
- gfp_mask, node);
+ size += SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+ data = kmalloc_node_track_caller(size, gfp_mask, node);
if (!data)
goto nodata;
+ /* kmalloc(size) might give us more room than requested.
+ * Put skb_shared_info exactly at the end of allocated zone,
+ * to allow max possible filling before reallocation.
+ */
+ size = SKB_WITH_OVERHEAD(ksize(data));
prefetchw(data + size);
/*
@@ -197,7 +207,8 @@ struct sk_buff *__alloc_skb(unsigned int size, gfp_t gfp_mask,
* the tail pointer in struct sk_buff!
*/
memset(skb, 0, offsetof(struct sk_buff, tail));
- skb->truesize = size + sizeof(struct sk_buff);
+ /* Account for allocated memory : skb + skb->head */
+ skb->truesize = SKB_TRUESIZE(size);
atomic_set(&skb->users, 1);
skb->head = data;
skb->data = data;
@@ -234,6 +245,55 @@ nodata:
EXPORT_SYMBOL(__alloc_skb);
/**
+ * build_skb - build a network buffer
+ * @data: data buffer provided by caller
+ *
+ * Allocate a new &sk_buff. Caller provides space holding head and
+ * skb_shared_info. @data must have been allocated by kmalloc()
+ * The return is the new skb buffer.
+ * On a failure the return is %NULL, and @data is not freed.
+ * Notes :
+ * Before IO, the driver allocates only the data buffer where the NIC will
+ * put the incoming frame. The driver should add room at the head
+ * (NET_SKB_PAD) and MUST add room at the tail
+ * (SKB_DATA_ALIGN(sizeof(struct skb_shared_info))).
+ * After IO, the driver calls build_skb() to allocate the sk_buff and
+ * populate it before giving the packet to the stack.
+ * RX rings then only contain data buffers, not full skbs.
+ */
+struct sk_buff *build_skb(void *data)
+{
+ struct skb_shared_info *shinfo;
+ struct sk_buff *skb;
+ unsigned int size;
+
+ /* Only the sk_buff head is allocated here; @data comes from the caller. */
+ skb = kmem_cache_alloc(skbuff_head_cache, GFP_ATOMIC);
+ if (!skb)
+ return NULL;
+
+ /* Usable area: size reported by ksize() minus the aligned shared info. */
+ size = ksize(data) - SKB_DATA_ALIGN(sizeof(struct skb_shared_info));
+
+ /* Clear every sk_buff field that precedes 'tail'. */
+ memset(skb, 0, offsetof(struct sk_buff, tail));
+ skb->truesize = SKB_TRUESIZE(size);
+ atomic_set(&skb->users, 1);
+ skb->head = data;
+ skb->data = data;
+ skb_reset_tail_pointer(skb);
+ skb->end = skb->tail + size;
+#ifdef NET_SKBUFF_DATA_USES_OFFSET
+ skb->mac_header = ~0U;
+#endif
+
+ /* make sure we initialize shinfo sequentially */
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
+ atomic_set(&shinfo->dataref, 1);
+ kmemcheck_annotate_variable(shinfo->destructor_arg);
+
+ return skb;
+}
+EXPORT_SYMBOL(build_skb);
+
+/**
* __netdev_alloc_skb - allocate an skbuff for rx on a specific device
* @dev: network device to receive on
* @length: length to allocate
@@ -326,7 +386,7 @@ static void skb_release_data(struct sk_buff *skb)
if (skb_shinfo(skb)->nr_frags) {
int i;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_unref(skb, i);
}
/*
@@ -392,7 +452,7 @@ static void skb_release_head_state(struct sk_buff *skb)
WARN_ON(in_irq());
skb->destructor(skb);
}
-#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
+#if IS_ENABLED(CONFIG_NF_CONNTRACK)
nf_conntrack_put(skb->nfct);
#endif
#ifdef NET_SKBUFF_NF_DEFRAG_NEEDED
@@ -475,6 +535,30 @@ void consume_skb(struct sk_buff *skb)
EXPORT_SYMBOL(consume_skb);
/**
+ * skb_recycle - clean up an skb for reuse
+ * @skb: buffer
+ *
+ * Recycles the skb to be reused as a receive buffer. This
+ * function does any necessary reference count dropping, and
+ * cleans up the skbuff as if it just came from __alloc_skb().
+ */
+void skb_recycle(struct sk_buff *skb)
+{ /* performs no eligibility checks itself; skb_recycle_check() tests skb_is_recycleable() before calling this */
+ struct skb_shared_info *shinfo;
+
+ skb_release_head_state(skb);
+
+ shinfo = skb_shinfo(skb);
+ memset(shinfo, 0, offsetof(struct skb_shared_info, dataref)); /* zero the shinfo fields located before 'dataref' */
+ atomic_set(&shinfo->dataref, 1);
+
+ memset(skb, 0, offsetof(struct sk_buff, tail)); /* zero every sk_buff field located before 'tail' */
+ skb->data = skb->head + NET_SKB_PAD; /* data restarts NET_SKB_PAD bytes into the existing head buffer */
+ skb_reset_tail_pointer(skb);
+}
+EXPORT_SYMBOL(skb_recycle);
+
+/**
* skb_recycle_check - check if skb can be reused for receive
* @skb: buffer
* @skb_size: minimum receive buffer size
@@ -488,33 +572,10 @@ EXPORT_SYMBOL(consume_skb);
*/
bool skb_recycle_check(struct sk_buff *skb, int skb_size)
{
- struct skb_shared_info *shinfo;
-
- if (irqs_disabled())
+ if (!skb_is_recycleable(skb, skb_size))
return false;
- if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY)
- return false;
-
- if (skb_is_nonlinear(skb) || skb->fclone != SKB_FCLONE_UNAVAILABLE)
- return false;
-
- skb_size = SKB_DATA_ALIGN(skb_size + NET_SKB_PAD);
- if (skb_end_pointer(skb) - skb->head < skb_size)
- return false;
-
- if (skb_shared(skb) || skb_cloned(skb))
- return false;
-
- skb_release_head_state(skb);
-
- shinfo = skb_shinfo(skb);
- memset(shinfo, 0, offsetof(struct skb_shared_info, dataref));
- atomic_set(&shinfo->dataref, 1);
-
- memset(skb, 0, offsetof(struct sk_buff, tail));
- skb->data = skb->head + NET_SKB_PAD;
- skb_reset_tail_pointer(skb);
+ skb_recycle(skb);
return true;
}
@@ -529,6 +590,8 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->mac_header = old->mac_header;
skb_dst_copy(new, old);
new->rxhash = old->rxhash;
+ new->ooo_okay = old->ooo_okay;
+ new->l4_rxhash = old->l4_rxhash;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
@@ -539,15 +602,14 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->ip_summed = old->ip_summed;
skb_copy_queue_mapping(new, old);
new->priority = old->priority;
-#if defined(CONFIG_IP_VS) || defined(CONFIG_IP_VS_MODULE)
+#if IS_ENABLED(CONFIG_IP_VS)
new->ipvs_property = old->ipvs_property;
#endif
new->protocol = old->protocol;
new->mark = old->mark;
new->skb_iif = old->skb_iif;
__nf_copy(new, old);
-#if defined(CONFIG_NETFILTER_XT_TARGET_TRACE) || \
- defined(CONFIG_NETFILTER_XT_TARGET_TRACE_MODULE)
+#if IS_ENABLED(CONFIG_NETFILTER_XT_TARGET_TRACE)
new->nf_trace = old->nf_trace;
#endif
#ifdef CONFIG_NET_SCHED
@@ -647,7 +709,7 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
}
vaddr = kmap_skb_frag(&skb_shinfo(skb)->frags[i]);
memcpy(page_address(page),
- vaddr + f->page_offset, f->size);
+ vaddr + f->page_offset, skb_frag_size(f));
kunmap_skb_frag(vaddr);
page->private = (unsigned long)head;
head = page;
@@ -655,14 +717,14 @@ int skb_copy_ubufs(struct sk_buff *skb, gfp_t gfp_mask)
/* skb frags release userspace buffers */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_unref(skb, i);
uarg->callback(uarg);
/* skb frags point to kernel buffers */
for (i = skb_shinfo(skb)->nr_frags; i > 0; i--) {
- skb_shinfo(skb)->frags[i - 1].page_offset = 0;
- skb_shinfo(skb)->frags[i - 1].page = head;
+ __skb_fill_page_desc(skb, i-1, head, 0,
+ skb_shinfo(skb)->frags[i - 1].size);
head = (struct page *)head->private;
}
@@ -777,8 +839,9 @@ struct sk_buff *skb_copy(const struct sk_buff *skb, gfp_t gfp_mask)
EXPORT_SYMBOL(skb_copy);
/**
- * pskb_copy - create copy of an sk_buff with private head.
+ * __pskb_copy - create copy of an sk_buff with private head.
* @skb: buffer to copy
+ * @headroom: headroom of new skb
* @gfp_mask: allocation priority
*
* Make a copy of both an &sk_buff and part of its data, located
@@ -789,16 +852,16 @@ EXPORT_SYMBOL(skb_copy);
* The returned buffer has a reference count of 1.
*/
-struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
+struct sk_buff *__pskb_copy(struct sk_buff *skb, int headroom, gfp_t gfp_mask)
{
- unsigned int size = skb_end_pointer(skb) - skb->head;
+ unsigned int size = skb_headlen(skb) + headroom;
struct sk_buff *n = alloc_skb(size, gfp_mask);
if (!n)
goto out;
/* Set the data pointer */
- skb_reserve(n, skb_headroom(skb));
+ skb_reserve(n, headroom);
/* Set the tail pointer and length */
skb_put(n, skb_headlen(skb));
/* Copy the bytes */
@@ -820,7 +883,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
skb_shinfo(n)->frags[i] = skb_shinfo(skb)->frags[i];
- get_page(skb_shinfo(n)->frags[i].page);
+ skb_frag_ref(skb, i);
}
skb_shinfo(n)->nr_frags = i;
}
@@ -834,7 +897,7 @@ struct sk_buff *pskb_copy(struct sk_buff *skb, gfp_t gfp_mask)
out:
return n;
}
-EXPORT_SYMBOL(pskb_copy);
+EXPORT_SYMBOL(__pskb_copy);
/**
* pskb_expand_head - reallocate header of &sk_buff
@@ -911,7 +974,7 @@ int pskb_expand_head(struct sk_buff *skb, int nhead, int ntail,
goto nofrags;
}
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++)
- get_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_ref(skb, i);
if (skb_has_frag_list(skb))
skb_clone_fraglist(skb);
@@ -1178,20 +1241,20 @@ int ___pskb_trim(struct sk_buff *skb, unsigned int len)
goto drop_pages;
for (; i < nfrags; i++) {
- int end = offset + skb_shinfo(skb)->frags[i].size;
+ int end = offset + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (end < len) {
offset = end;
continue;
}
- skb_shinfo(skb)->frags[i++].size = len - offset;
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i++], len - offset);
drop_pages:
skb_shinfo(skb)->nr_frags = i;
for (; i < nfrags; i++)
- put_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_unref(skb, i);
if (skb_has_frag_list(skb))
skb_drop_fraglist(skb);
@@ -1294,9 +1357,11 @@ unsigned char *__pskb_pull_tail(struct sk_buff *skb, int delta)
/* Estimate size of pulled pages. */
eat = delta;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size >= eat)
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size >= eat)
goto pull_pages;
- eat -= skb_shinfo(skb)->frags[i].size;
+ eat -= size;
}
/* If we need update frag list, we are in troubles.
@@ -1359,14 +1424,16 @@ pull_pages:
eat = delta;
k = 0;
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
- if (skb_shinfo(skb)->frags[i].size <= eat) {
- put_page(skb_shinfo(skb)->frags[i].page);
- eat -= skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
+
+ if (size <= eat) {
+ skb_frag_unref(skb, i);
+ eat -= size;
} else {
skb_shinfo(skb)->frags[k] = skb_shinfo(skb)->frags[i];
if (eat) {
skb_shinfo(skb)->frags[k].page_offset += eat;
- skb_shinfo(skb)->frags[k].size -= eat;
+ skb_frag_size_sub(&skb_shinfo(skb)->frags[k], eat);
eat = 0;
}
k++;
@@ -1421,7 +1488,7 @@ int skb_copy_bits(const struct sk_buff *skb, int offset, void *to, int len)
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
u8 *vaddr;
@@ -1619,7 +1686,8 @@ static int __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
for (seg = 0; seg < skb_shinfo(skb)->nr_frags; seg++) {
const skb_frag_t *f = &skb_shinfo(skb)->frags[seg];
- if (__splice_segment(f->page, f->page_offset, f->size,
+ if (__splice_segment(skb_frag_page(f),
+ f->page_offset, skb_frag_size(f),
offset, len, skb, spd, 0, sk, pipe))
return 1;
}
@@ -1729,7 +1797,7 @@ int skb_store_bits(struct sk_buff *skb, int offset, const void *from, int len)
WARN_ON(start > offset + len);
- end = start + frag->size;
+ end = start + skb_frag_size(frag);
if ((copy = end - offset) > 0) {
u8 *vaddr;
@@ -1802,7 +1870,7 @@ __wsum skb_checksum(const struct sk_buff *skb, int offset,
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
__wsum csum2;
u8 *vaddr;
@@ -1877,7 +1945,7 @@ __wsum skb_copy_and_csum_bits(const struct sk_buff *skb, int offset,
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
__wsum csum2;
u8 *vaddr;
@@ -2150,7 +2218,7 @@ static inline void skb_split_no_header(struct sk_buff *skb,
skb->data_len = len - pos;
for (i = 0; i < nfrags; i++) {
- int size = skb_shinfo(skb)->frags[i].size;
+ int size = skb_frag_size(&skb_shinfo(skb)->frags[i]);
if (pos + size > len) {
skb_shinfo(skb1)->frags[k] = skb_shinfo(skb)->frags[i];
@@ -2164,10 +2232,10 @@ static inline void skb_split_no_header(struct sk_buff *skb,
* where splitting is expensive.
* 2. Split is accurately. We make this.
*/
- get_page(skb_shinfo(skb)->frags[i].page);
+ skb_frag_ref(skb, i);
skb_shinfo(skb1)->frags[0].page_offset += len - pos;
- skb_shinfo(skb1)->frags[0].size -= len - pos;
- skb_shinfo(skb)->frags[i].size = len - pos;
+ skb_frag_size_sub(&skb_shinfo(skb1)->frags[0], len - pos);
+ skb_frag_size_set(&skb_shinfo(skb)->frags[i], len - pos);
skb_shinfo(skb)->nr_frags++;
}
k++;
@@ -2211,7 +2279,7 @@ static int skb_prepare_for_shift(struct sk_buff *skb)
* @shiftlen: shift up to this many bytes
*
* Attempts to shift up to shiftlen worth of bytes, which may be less than
- * the length of the skb, from tgt to skb. Returns number bytes shifted.
+ * the length of the skb, from skb to tgt. Returns number bytes shifted.
* It's up to caller to free skb if everything was shifted.
*
* If @tgt runs out of frags, the whole operation is aborted.
@@ -2239,12 +2307,13 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
* commit all, so that we don't have to undo partial changes
*/
if (!to ||
- !skb_can_coalesce(tgt, to, fragfrom->page, fragfrom->page_offset)) {
+ !skb_can_coalesce(tgt, to, skb_frag_page(fragfrom),
+ fragfrom->page_offset)) {
merge = -1;
} else {
merge = to - 1;
- todo -= fragfrom->size;
+ todo -= skb_frag_size(fragfrom);
if (todo < 0) {
if (skb_prepare_for_shift(skb) ||
skb_prepare_for_shift(tgt))
@@ -2254,8 +2323,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragfrom = &skb_shinfo(skb)->frags[from];
fragto = &skb_shinfo(tgt)->frags[merge];
- fragto->size += shiftlen;
- fragfrom->size -= shiftlen;
+ skb_frag_size_add(fragto, shiftlen);
+ skb_frag_size_sub(fragfrom, shiftlen);
fragfrom->page_offset += shiftlen;
goto onlymerged;
@@ -2279,20 +2348,20 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragfrom = &skb_shinfo(skb)->frags[from];
fragto = &skb_shinfo(tgt)->frags[to];
- if (todo >= fragfrom->size) {
+ if (todo >= skb_frag_size(fragfrom)) {
*fragto = *fragfrom;
- todo -= fragfrom->size;
+ todo -= skb_frag_size(fragfrom);
from++;
to++;
} else {
- get_page(fragfrom->page);
+ __skb_frag_ref(fragfrom);
fragto->page = fragfrom->page;
fragto->page_offset = fragfrom->page_offset;
- fragto->size = todo;
+ skb_frag_size_set(fragto, todo);
fragfrom->page_offset += todo;
- fragfrom->size -= todo;
+ skb_frag_size_sub(fragfrom, todo);
todo = 0;
to++;
@@ -2307,8 +2376,8 @@ int skb_shift(struct sk_buff *tgt, struct sk_buff *skb, int shiftlen)
fragfrom = &skb_shinfo(skb)->frags[0];
fragto = &skb_shinfo(tgt)->frags[merge];
- fragto->size += fragfrom->size;
- put_page(fragfrom->page);
+ skb_frag_size_add(fragto, skb_frag_size(fragfrom));
+ __skb_frag_unref(fragfrom);
}
/* Reposition in the original skb */
@@ -2405,7 +2474,7 @@ next_skb:
while (st->frag_idx < skb_shinfo(st->cur_skb)->nr_frags) {
frag = &skb_shinfo(st->cur_skb)->frags[st->frag_idx];
- block_limit = frag->size + st->stepped_offset;
+ block_limit = skb_frag_size(frag) + st->stepped_offset;
if (abs_offset < block_limit) {
if (!st->frag_data)
@@ -2423,7 +2492,7 @@ next_skb:
}
st->frag_idx++;
- st->stepped_offset += frag->size;
+ st->stepped_offset += skb_frag_size(frag);
}
if (st->frag_data) {
@@ -2553,14 +2622,13 @@ int skb_append_datato_frags(struct sock *sk, struct sk_buff *skb,
left = PAGE_SIZE - frag->page_offset;
copy = (length > left)? left : length;
- ret = getfrag(from, (page_address(frag->page) +
- frag->page_offset + frag->size),
+ ret = getfrag(from, skb_frag_address(frag) + skb_frag_size(frag),
offset, copy, 0, skb);
if (ret < 0)
return -EFAULT;
/* copy was successful so update the size parameters */
- frag->size += copy;
+ skb_frag_size_add(frag, copy);
skb->len += copy;
skb->data_len += copy;
offset += copy;
@@ -2602,7 +2670,7 @@ EXPORT_SYMBOL_GPL(skb_pull_rcsum);
* a pointer to the first in a list of new skbs for the segments.
* In case of error it returns ERR_PTR(err).
*/
-struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
+struct sk_buff *skb_segment(struct sk_buff *skb, netdev_features_t features)
{
struct sk_buff *segs = NULL;
struct sk_buff *tail = NULL;
@@ -2706,12 +2774,12 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
while (pos < offset + len && i < nfrags) {
*frag = skb_shinfo(skb)->frags[i];
- get_page(frag->page);
- size = frag->size;
+ __skb_frag_ref(frag);
+ size = skb_frag_size(frag);
if (pos < offset) {
frag->page_offset += offset - pos;
- frag->size -= offset - pos;
+ skb_frag_size_sub(frag, offset - pos);
}
skb_shinfo(nskb)->nr_frags++;
@@ -2720,7 +2788,7 @@ struct sk_buff *skb_segment(struct sk_buff *skb, u32 features)
i++;
pos += size;
} else {
- frag->size -= pos + size - (offset + len);
+ skb_frag_size_sub(frag, pos + size - (offset + len));
goto skip_fraglist;
}
@@ -2800,7 +2868,7 @@ int skb_gro_receive(struct sk_buff **head, struct sk_buff *skb)
} while (--i);
frag->page_offset += offset;
- frag->size -= offset;
+ skb_frag_size_sub(frag, offset);
skb->truesize -= skb->data_len;
skb->len -= skb->data_len;
@@ -2852,7 +2920,7 @@ merge:
unsigned int eat = offset - headlen;
skbinfo->frags[0].page_offset += eat;
- skbinfo->frags[0].size -= eat;
+ skb_frag_size_sub(&skbinfo->frags[0], eat);
skb->data_len -= eat;
skb->len -= eat;
offset = headlen;
@@ -2923,13 +2991,13 @@ __skb_to_sgvec(struct sk_buff *skb, struct scatterlist *sg, int offset, int len)
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(&skb_shinfo(skb)->frags[i]);
if ((copy = end - offset) > 0) {
skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
if (copy > len)
copy = len;
- sg_set_page(&sg[elt], frag->page, copy,
+ sg_set_page(&sg[elt], skb_frag_page(frag), copy,
frag->page_offset+offset-start);
elt++;
if (!(len -= copy))
@@ -3150,6 +3218,26 @@ void skb_tstamp_tx(struct sk_buff *orig_skb,
}
EXPORT_SYMBOL_GPL(skb_tstamp_tx);
+void skb_complete_wifi_ack(struct sk_buff *skb, bool acked)
+{ /* record the ack result on @skb, then hand @skb to sock_queue_err_skb() for skb->sk */
+ struct sock *sk = skb->sk;
+ struct sock_exterr_skb *serr;
+ int err;
+
+ skb->wifi_acked_valid = 1; /* marks wifi_acked below as carrying a result */
+ skb->wifi_acked = acked;
+
+ serr = SKB_EXT_ERR(skb);
+ memset(serr, 0, sizeof(*serr)); /* start from an all-zero extended error record */
+ serr->ee.ee_errno = ENOMSG;
+ serr->ee.ee_origin = SO_EE_ORIGIN_TXSTATUS;
+
+ err = sock_queue_err_skb(sk, skb);
+ if (err)
+ kfree_skb(skb); /* queuing failed: @skb is freed here */
+}
+EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
+
/**
* skb_partial_csum_set - set up and verify partial csum values for packet
diff --git a/net/core/sock.c b/net/core/sock.c
index bc745d00ea4..5c5af9988f9 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -111,6 +111,8 @@
#include <linux/init.h>
#include <linux/highmem.h>
#include <linux/user_namespace.h>
+#include <linux/jump_label.h>
+#include <linux/memcontrol.h>
#include <asm/uaccess.h>
#include <asm/system.h>
@@ -125,6 +127,7 @@
#include <net/xfrm.h>
#include <linux/ipsec.h>
#include <net/cls_cgroup.h>
+#include <net/netprio_cgroup.h>
#include <linux/filter.h>
@@ -134,6 +137,46 @@
#include <net/tcp.h>
#endif
+static DEFINE_MUTEX(proto_list_mutex);
+static LIST_HEAD(proto_list);
+
+#ifdef CONFIG_CGROUP_MEM_RES_CTLR_KMEM
+int mem_cgroup_sockets_init(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{ /* call init_cgroup() on every proto in proto_list that provides one; returns 0 or the first non-zero result */
+ struct proto *proto;
+ int ret = 0;
+
+ mutex_lock(&proto_list_mutex); /* held across the whole walk, including the unwind below */
+ list_for_each_entry(proto, &proto_list, node) {
+ if (proto->init_cgroup) {
+ ret = proto->init_cgroup(cgrp, ss);
+ if (ret)
+ goto out; /* stop at the first failure and unwind */
+ }
+ }
+
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+out:
+ list_for_each_entry_continue_reverse(proto, &proto_list, node) /* walk back over the entries preceding the one that failed */
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(cgrp, ss);
+ mutex_unlock(&proto_list_mutex);
+ return ret;
+}
+
+void mem_cgroup_sockets_destroy(struct cgroup *cgrp, struct cgroup_subsys *ss)
+{ /* call destroy_cgroup() on every proto in proto_list that provides one */
+ struct proto *proto;
+
+ mutex_lock(&proto_list_mutex);
+ list_for_each_entry_reverse(proto, &proto_list, node) /* reverse of the order used by mem_cgroup_sockets_init() */
+ if (proto->destroy_cgroup)
+ proto->destroy_cgroup(cgrp, ss);
+ mutex_unlock(&proto_list_mutex);
+}
+#endif
+
/*
* Each address family might have different locking rules, so we have
* one slock key per address family:
@@ -141,6 +184,9 @@
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
+struct jump_label_key memcg_socket_limit_enabled;
+EXPORT_SYMBOL(memcg_socket_limit_enabled);
+
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
@@ -207,7 +253,7 @@ static struct lock_class_key af_callback_keys[AF_MAX];
* not depend upon such differences.
*/
#define _SK_MEM_PACKETS 256
-#define _SK_MEM_OVERHEAD (sizeof(struct sk_buff) + 256)
+#define _SK_MEM_OVERHEAD SKB_TRUESIZE(256)
#define SK_WMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
#define SK_RMEM_MAX (_SK_MEM_OVERHEAD * _SK_MEM_PACKETS)
@@ -221,10 +267,16 @@ __u32 sysctl_rmem_default __read_mostly = SK_RMEM_MAX;
int sysctl_optmem_max __read_mostly = sizeof(unsigned long)*(2*UIO_MAXIOV+512);
EXPORT_SYMBOL(sysctl_optmem_max);
-#if defined(CONFIG_CGROUPS) && !defined(CONFIG_NET_CLS_CGROUP)
+#if defined(CONFIG_CGROUPS)
+#if !defined(CONFIG_NET_CLS_CGROUP)
int net_cls_subsys_id = -1;
EXPORT_SYMBOL_GPL(net_cls_subsys_id);
#endif
+#if !defined(CONFIG_NETPRIO_CGROUP)
+int net_prio_subsys_id = -1;
+EXPORT_SYMBOL_GPL(net_prio_subsys_id);
+#endif
+#endif
static int sock_set_timeout(long *timeo_p, char __user *optval, int optlen)
{
@@ -269,14 +321,14 @@ static void sock_warn_obsolete_bsdism(const char *name)
}
}
-static void sock_disable_timestamp(struct sock *sk, int flag)
+#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
+
+static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
- if (sock_flag(sk, flag)) {
- sock_reset_flag(sk, flag);
- if (!sock_flag(sk, SOCK_TIMESTAMP) &&
- !sock_flag(sk, SOCK_TIMESTAMPING_RX_SOFTWARE)) {
+ if (sk->sk_flags & flags) {
+ sk->sk_flags &= ~flags;
+ if (!(sk->sk_flags & SK_FLAGS_TIMESTAMP))
net_disable_timestamp();
- }
}
}
@@ -288,11 +340,7 @@ int sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
unsigned long flags;
struct sk_buff_head *list = &sk->sk_receive_queue;
- /* Cast sk->rcvbuf to unsigned... It's pointless, but reduces
- number of warnings when compiling with -W --ANK
- */
- if (atomic_read(&sk->sk_rmem_alloc) + skb->truesize >=
- (unsigned)sk->sk_rcvbuf) {
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf) {
atomic_inc(&sk->sk_drops);
trace_sock_rcvqueue_full(sk, skb);
return -ENOMEM;
@@ -387,7 +435,7 @@ struct dst_entry *__sk_dst_check(struct sock *sk, u32 cookie)
if (dst && dst->obsolete && dst->ops->check(dst, cookie) == NULL) {
sk_tx_queue_clear(sk);
- rcu_assign_pointer(sk->sk_dst_cache, NULL);
+ RCU_INIT_POINTER(sk->sk_dst_cache, NULL);
dst_release(dst);
return NULL;
}
@@ -682,7 +730,7 @@ set_rcvbuf:
SOCK_TIMESTAMPING_RX_SOFTWARE);
else
sock_disable_timestamp(sk,
- SOCK_TIMESTAMPING_RX_SOFTWARE);
+ (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE));
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SOFTWARE,
val & SOF_TIMESTAMPING_SOFTWARE);
sock_valbool_flag(sk, SOCK_TIMESTAMPING_SYS_HARDWARE,
@@ -738,11 +786,13 @@ set_rcvbuf:
/* We implement the SO_SNDLOWAT etc to
not be settable (1003.1g 5.3) */
case SO_RXQ_OVFL:
- if (valbool)
- sock_set_flag(sk, SOCK_RXQ_OVFL);
- else
- sock_reset_flag(sk, SOCK_RXQ_OVFL);
+ sock_valbool_flag(sk, SOCK_RXQ_OVFL, valbool);
break;
+
+ case SO_WIFI_STATUS:
+ sock_valbool_flag(sk, SOCK_WIFI_STATUS, valbool);
+ break;
+
default:
ret = -ENOPROTOOPT;
break;
@@ -964,6 +1014,10 @@ int sock_getsockopt(struct socket *sock, int level, int optname,
v.val = !!sock_flag(sk, SOCK_RXQ_OVFL);
break;
+ case SO_WIFI_STATUS:
+ v.val = !!sock_flag(sk, SOCK_WIFI_STATUS);
+ break;
+
default:
return -ENOPROTOOPT;
}
@@ -1114,6 +1168,18 @@ void sock_update_classid(struct sock *sk)
sk->sk_classid = classid;
}
EXPORT_SYMBOL(sock_update_classid);
+
+void sock_update_netprioidx(struct sock *sk)
+{ /* set sk->sk_cgrp_prioidx from the netprio state of the current task */
+ struct cgroup_netprio_state *state;
+ if (in_interrupt())
+ return; /* sk_cgrp_prioidx is left unchanged; presumably `current` is unrelated to the socket in interrupt context - confirm */
+ rcu_read_lock();
+ state = task_netprio_state(current);
+ sk->sk_cgrp_prioidx = state ? state->prioidx : 0; /* falls back to index 0 when no state is returned */
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL_GPL(sock_update_netprioidx);
#endif
/**
@@ -1141,6 +1207,7 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
atomic_set(&sk->sk_wmem_alloc, 1);
sock_update_classid(sk);
+ sock_update_netprioidx(sk);
}
return sk;
@@ -1158,11 +1225,10 @@ static void __sk_free(struct sock *sk)
atomic_read(&sk->sk_wmem_alloc) == 0);
if (filter) {
sk_filter_uncharge(sk, filter);
- rcu_assign_pointer(sk->sk_filter, NULL);
+ RCU_INIT_POINTER(sk->sk_filter, NULL);
}
- sock_disable_timestamp(sk, SOCK_TIMESTAMP);
- sock_disable_timestamp(sk, SOCK_TIMESTAMPING_RX_SOFTWARE);
+ sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
if (atomic_read(&sk->sk_omem_alloc))
printk(KERN_DEBUG "%s: optmem leakage (%d bytes) detected.\n",
@@ -1207,7 +1273,20 @@ void sk_release_kernel(struct sock *sk)
}
EXPORT_SYMBOL(sk_release_kernel);
-struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
+static void sk_update_clone(const struct sock *sk, struct sock *newsk)
+{ /* @sk is the socket being cloned, @newsk its clone */
+ if (mem_cgroup_sockets_enabled && sk->sk_cgrp) /* only when the parent has sk_cgrp set */
+ sock_update_memcg(newsk);
+}
+
+/**
+ * sk_clone_lock - clone a socket, and lock its clone
+ * @sk: the socket to clone
+ * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
+ *
+ * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
+ */
+struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
{
struct sock *newsk;
@@ -1260,6 +1339,7 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
+ bh_unlock_sock(newsk);
sk_free(newsk);
newsk = NULL;
goto out;
@@ -1289,17 +1369,18 @@ struct sock *sk_clone(const struct sock *sk, const gfp_t priority)
sk_set_socket(newsk, NULL);
newsk->sk_wq = NULL;
+ sk_update_clone(sk, newsk);
+
if (newsk->sk_prot->sockets_allocated)
- percpu_counter_inc(newsk->sk_prot->sockets_allocated);
+ sk_sockets_allocated_inc(newsk);
- if (sock_flag(newsk, SOCK_TIMESTAMP) ||
- sock_flag(newsk, SOCK_TIMESTAMPING_RX_SOFTWARE))
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
net_enable_timestamp();
}
out:
return newsk;
}
-EXPORT_SYMBOL_GPL(sk_clone);
+EXPORT_SYMBOL_GPL(sk_clone_lock);
void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
@@ -1533,7 +1614,6 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
skb_shinfo(skb)->nr_frags = npages;
for (i = 0; i < npages; i++) {
struct page *page;
- skb_frag_t *frag;
page = alloc_pages(sk->sk_allocation, 0);
if (!page) {
@@ -1543,12 +1623,11 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
goto failure;
}
- frag = &skb_shinfo(skb)->frags[i];
- frag->page = page;
- frag->page_offset = 0;
- frag->size = (data_len >= PAGE_SIZE ?
- PAGE_SIZE :
- data_len);
+ __skb_fill_page_desc(skb, i,
+ page, 0,
+ (data_len >= PAGE_SIZE ?
+ PAGE_SIZE :
+ data_len));
data_len -= PAGE_SIZE;
}
@@ -1681,30 +1760,34 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
struct proto *prot = sk->sk_prot;
int amt = sk_mem_pages(size);
long allocated;
+ int parent_status = UNDER_LIMIT;
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- allocated = atomic_long_add_return(amt, prot->memory_allocated);
+
+ allocated = sk_memory_allocated_add(sk, amt, &parent_status);
/* Under limit. */
- if (allocated <= prot->sysctl_mem[0]) {
- if (prot->memory_pressure && *prot->memory_pressure)
- *prot->memory_pressure = 0;
+ if (parent_status == UNDER_LIMIT &&
+ allocated <= sk_prot_mem_limits(sk, 0)) {
+ sk_leave_memory_pressure(sk);
return 1;
}
- /* Under pressure. */
- if (allocated > prot->sysctl_mem[1])
- if (prot->enter_memory_pressure)
- prot->enter_memory_pressure(sk);
+ /* Under pressure. (we or our parents) */
+ if ((parent_status > SOFT_LIMIT) ||
+ allocated > sk_prot_mem_limits(sk, 1))
+ sk_enter_memory_pressure(sk);
- /* Over hard limit. */
- if (allocated > prot->sysctl_mem[2])
+ /* Over hard limit (we or our parents) */
+ if ((parent_status == OVER_LIMIT) ||
+ (allocated > sk_prot_mem_limits(sk, 2)))
goto suppress_allocation;
/* guarantee minimum buffer size under pressure */
if (kind == SK_MEM_RECV) {
if (atomic_read(&sk->sk_rmem_alloc) < prot->sysctl_rmem[0])
return 1;
+
} else { /* SK_MEM_SEND */
if (sk->sk_type == SOCK_STREAM) {
if (sk->sk_wmem_queued < prot->sysctl_wmem[0])
@@ -1714,13 +1797,13 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
return 1;
}
- if (prot->memory_pressure) {
+ if (sk_has_memory_pressure(sk)) {
int alloc;
- if (!*prot->memory_pressure)
+ if (!sk_under_memory_pressure(sk))
return 1;
- alloc = percpu_counter_read_positive(prot->sockets_allocated);
- if (prot->sysctl_mem[2] > alloc *
+ alloc = sk_sockets_allocated_read_positive(sk);
+ if (sk_prot_mem_limits(sk, 2) > alloc *
sk_mem_pages(sk->sk_wmem_queued +
atomic_read(&sk->sk_rmem_alloc) +
sk->sk_forward_alloc))
@@ -1743,7 +1826,9 @@ suppress_allocation:
/* Alas. Undo changes. */
sk->sk_forward_alloc -= amt * SK_MEM_QUANTUM;
- atomic_long_sub(amt, prot->memory_allocated);
+
+ sk_memory_allocated_sub(sk, amt, parent_status);
+
return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -1754,15 +1839,13 @@ EXPORT_SYMBOL(__sk_mem_schedule);
*/
void __sk_mem_reclaim(struct sock *sk)
{
- struct proto *prot = sk->sk_prot;
-
- atomic_long_sub(sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT,
- prot->memory_allocated);
+ sk_memory_allocated_sub(sk,
+ sk->sk_forward_alloc >> SK_MEM_QUANTUM_SHIFT, 0);
sk->sk_forward_alloc &= SK_MEM_QUANTUM - 1;
- if (prot->memory_pressure && *prot->memory_pressure &&
- (atomic_long_read(prot->memory_allocated) < prot->sysctl_mem[0]))
- *prot->memory_pressure = 0;
+ if (sk_under_memory_pressure(sk) &&
+ (sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
+ sk_leave_memory_pressure(sk);
}
EXPORT_SYMBOL(__sk_mem_reclaim);
@@ -2133,16 +2216,15 @@ EXPORT_SYMBOL(sock_get_timestampns);
void sock_enable_timestamp(struct sock *sk, int flag)
{
if (!sock_flag(sk, flag)) {
+ unsigned long previous_flags = sk->sk_flags;
+
sock_set_flag(sk, flag);
/*
* we just set one of the two flags which require net
* time stamping, but time stamping might have been on
* already because of the other one
*/
- if (!sock_flag(sk,
- flag == SOCK_TIMESTAMP ?
- SOCK_TIMESTAMPING_RX_SOFTWARE :
- SOCK_TIMESTAMP))
+ if (!(previous_flags & SK_FLAGS_TIMESTAMP))
net_enable_timestamp();
}
}
@@ -2254,9 +2336,6 @@ void sk_common_release(struct sock *sk)
}
EXPORT_SYMBOL(sk_common_release);
-static DEFINE_RWLOCK(proto_list_lock);
-static LIST_HEAD(proto_list);
-
#ifdef CONFIG_PROC_FS
#define PROTO_INUSE_NR 64 /* should be enough for the first time */
struct prot_inuse {
@@ -2405,10 +2484,10 @@ int proto_register(struct proto *prot, int alloc_slab)
}
}
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
list_add(&prot->node, &proto_list);
assign_proto_idx(prot);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
return 0;
out_free_timewait_sock_slab_name:
@@ -2431,10 +2510,10 @@ EXPORT_SYMBOL(proto_register);
void proto_unregister(struct proto *prot)
{
- write_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
release_proto_idx(prot);
list_del(&prot->node);
- write_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
if (prot->slab != NULL) {
kmem_cache_destroy(prot->slab);
@@ -2457,9 +2536,9 @@ EXPORT_SYMBOL(proto_unregister);
#ifdef CONFIG_PROC_FS
static void *proto_seq_start(struct seq_file *seq, loff_t *pos)
- __acquires(proto_list_lock)
+ __acquires(proto_list_mutex)
{
- read_lock(&proto_list_lock);
+ mutex_lock(&proto_list_mutex);
return seq_list_start_head(&proto_list, *pos);
}
@@ -2469,25 +2548,36 @@ static void *proto_seq_next(struct seq_file *seq, void *v, loff_t *pos)
}
static void proto_seq_stop(struct seq_file *seq, void *v)
- __releases(proto_list_lock)
+ __releases(proto_list_mutex)
{
- read_unlock(&proto_list_lock);
+ mutex_unlock(&proto_list_mutex);
}
static char proto_method_implemented(const void *method)
{
return method == NULL ? 'n' : 'y';
}
+static long sock_prot_memory_allocated(struct proto *proto)
+{ /* value printed by proto_seq_printf(); -1 when @proto has no memory_allocated pointer */
+ return proto->memory_allocated != NULL ? proto_memory_allocated(proto): -1L;
+}
+
+static char *sock_prot_memory_pressure(struct proto *proto)
+{ /* string printed by proto_seq_printf(): "NI" when @proto has no memory_pressure pointer, else "yes"/"no" */
+ return proto->memory_pressure != NULL ?
+ proto_memory_pressure(proto) ? "yes" : "no" : "NI";
+}
static void proto_seq_printf(struct seq_file *seq, struct proto *proto)
{
+
seq_printf(seq, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
"%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
proto->name,
proto->obj_size,
sock_prot_inuse_get(seq_file_net(seq), proto),
- proto->memory_allocated != NULL ? atomic_long_read(proto->memory_allocated) : -1L,
- proto->memory_pressure != NULL ? *proto->memory_pressure ? "yes" : "no" : "NI",
+ sock_prot_memory_allocated(proto),
+ sock_prot_memory_pressure(proto),
proto->max_header,
proto->slab == NULL ? "no" : "yes",
module_name(proto->owner),
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
new file mode 100644
index 00000000000..b9868e1fd62
--- /dev/null
+++ b/net/core/sock_diag.c
@@ -0,0 +1,192 @@
+#include <linux/mutex.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <net/netlink.h>
+#include <net/net_namespace.h>
+#include <linux/module.h>
+#include <linux/rtnetlink.h>
+#include <net/sock.h>
+
+#include <linux/inet_diag.h>
+#include <linux/sock_diag.h>
+
+static struct sock_diag_handler *sock_diag_handlers[AF_MAX];
+static int (*inet_rcv_compat)(struct sk_buff *skb, struct nlmsghdr *nlh);
+static DEFINE_MUTEX(sock_diag_table_mutex);
+
+int sock_diag_check_cookie(void *sk, __u32 *cookie)
+{
+ if ((cookie[0] != INET_DIAG_NOCOOKIE ||
+ cookie[1] != INET_DIAG_NOCOOKIE) &&
+ ((u32)(unsigned long)sk != cookie[0] ||
+ (u32)((((unsigned long)sk) >> 31) >> 1) != cookie[1]))
+ return -ESTALE;
+ else
+ return 0;
+}
+EXPORT_SYMBOL_GPL(sock_diag_check_cookie);
+
+void sock_diag_save_cookie(void *sk, __u32 *cookie)
+{
+ cookie[0] = (u32)(unsigned long)sk;
+ cookie[1] = (u32)(((unsigned long)sk >> 31) >> 1);
+}
+EXPORT_SYMBOL_GPL(sock_diag_save_cookie);
+
+int sock_diag_put_meminfo(struct sock *sk, struct sk_buff *skb, int attrtype)
+{
+ __u32 *mem;
+
+ mem = RTA_DATA(__RTA_PUT(skb, attrtype, SK_MEMINFO_VARS * sizeof(__u32)));
+
+ mem[SK_MEMINFO_RMEM_ALLOC] = sk_rmem_alloc_get(sk);
+ mem[SK_MEMINFO_RCVBUF] = sk->sk_rcvbuf;
+ mem[SK_MEMINFO_WMEM_ALLOC] = sk_wmem_alloc_get(sk);
+ mem[SK_MEMINFO_SNDBUF] = sk->sk_sndbuf;
+ mem[SK_MEMINFO_FWD_ALLOC] = sk->sk_forward_alloc;
+ mem[SK_MEMINFO_WMEM_QUEUED] = sk->sk_wmem_queued;
+ mem[SK_MEMINFO_OPTMEM] = atomic_read(&sk->sk_omem_alloc);
+
+ return 0;
+
+rtattr_failure:
+ return -EMSGSIZE;
+}
+EXPORT_SYMBOL_GPL(sock_diag_put_meminfo);
+
+void sock_diag_register_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = fn;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_register_inet_compat);
+
+void sock_diag_unregister_inet_compat(int (*fn)(struct sk_buff *skb, struct nlmsghdr *nlh))
+{
+ mutex_lock(&sock_diag_table_mutex);
+ inet_rcv_compat = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister_inet_compat);
+
+int sock_diag_register(struct sock_diag_handler *hndl)
+{
+ int err = 0;
+
+ if (hndl->family >= AF_MAX)
+ return -EINVAL;
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (sock_diag_handlers[hndl->family])
+ err = -EBUSY;
+ else
+ sock_diag_handlers[hndl->family] = hndl;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(sock_diag_register);
+
+void sock_diag_unregister(struct sock_diag_handler *hnld)
+{
+ int family = hnld->family;
+
+ if (family >= AF_MAX)
+ return;
+
+ mutex_lock(&sock_diag_table_mutex);
+ BUG_ON(sock_diag_handlers[family] != hnld);
+ sock_diag_handlers[family] = NULL;
+ mutex_unlock(&sock_diag_table_mutex);
+}
+EXPORT_SYMBOL_GPL(sock_diag_unregister);
+
+static inline struct sock_diag_handler *sock_diag_lock_handler(int family)
+{
+ if (sock_diag_handlers[family] == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, family);
+
+ mutex_lock(&sock_diag_table_mutex);
+ return sock_diag_handlers[family];
+}
+
+static inline void sock_diag_unlock_handler(struct sock_diag_handler *h)
+{
+ mutex_unlock(&sock_diag_table_mutex);
+}
+
+static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int err;
+ struct sock_diag_req *req = NLMSG_DATA(nlh);
+ struct sock_diag_handler *hndl;
+
+ if (nlmsg_len(nlh) < sizeof(*req))
+ return -EINVAL;
+
+ hndl = sock_diag_lock_handler(req->sdiag_family);
+ if (hndl == NULL)
+ err = -ENOENT;
+ else
+ err = hndl->dump(skb, nlh);
+ sock_diag_unlock_handler(hndl);
+
+ return err;
+}
+
+static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ int ret;
+
+ switch (nlh->nlmsg_type) {
+ case TCPDIAG_GETSOCK:
+ case DCCPDIAG_GETSOCK:
+ if (inet_rcv_compat == NULL)
+ request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK,
+ NETLINK_SOCK_DIAG, AF_INET);
+
+ mutex_lock(&sock_diag_table_mutex);
+ if (inet_rcv_compat != NULL)
+ ret = inet_rcv_compat(skb, nlh);
+ else
+ ret = -EOPNOTSUPP;
+ mutex_unlock(&sock_diag_table_mutex);
+
+ return ret;
+ case SOCK_DIAG_BY_FAMILY:
+ return __sock_diag_rcv_msg(skb, nlh);
+ default:
+ return -EINVAL;
+ }
+}
+
+static DEFINE_MUTEX(sock_diag_mutex);
+
+static void sock_diag_rcv(struct sk_buff *skb)
+{
+ mutex_lock(&sock_diag_mutex);
+ netlink_rcv_skb(skb, &sock_diag_rcv_msg);
+ mutex_unlock(&sock_diag_mutex);
+}
+
+struct sock *sock_diag_nlsk;
+EXPORT_SYMBOL_GPL(sock_diag_nlsk);
+
+static int __init sock_diag_init(void)
+{
+ sock_diag_nlsk = netlink_kernel_create(&init_net, NETLINK_SOCK_DIAG, 0,
+ sock_diag_rcv, NULL, THIS_MODULE);
+ return sock_diag_nlsk == NULL ? -ENOMEM : 0;
+}
+
+static void __exit sock_diag_exit(void)
+{
+ netlink_kernel_release(sock_diag_nlsk);
+}
+
+module_init(sock_diag_init);
+module_exit(sock_diag_exit);
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_SOCK_DIAG);
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 77a65f03148..d05559d4d9c 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -68,8 +68,13 @@ static int rps_sock_flow_sysctl(ctl_table *table, int write,
if (sock_table != orig_sock_table) {
rcu_assign_pointer(rps_sock_flow_table, sock_table);
- synchronize_rcu();
- vfree(orig_sock_table);
+ if (sock_table)
+ jump_label_inc(&rps_needed);
+ if (orig_sock_table) {
+ jump_label_dec(&rps_needed);
+ synchronize_rcu();
+ vfree(orig_sock_table);
+ }
}
}
diff --git a/net/core/timestamping.c b/net/core/timestamping.c
index 98a52640e7c..661b5a40ec1 100644
--- a/net/core/timestamping.c
+++ b/net/core/timestamping.c
@@ -21,6 +21,7 @@
#include <linux/phy.h>
#include <linux/ptp_classify.h>
#include <linux/skbuff.h>
+#include <linux/export.h>
static struct sock_filter ptp_filter[] = {
PTP_FILTER
@@ -57,9 +58,13 @@ void skb_clone_tx_timestamp(struct sk_buff *skb)
case PTP_CLASS_V2_VLAN:
phydev = skb->dev->phydev;
if (likely(phydev->drv->txtstamp)) {
+ if (!atomic_inc_not_zero(&sk->sk_refcnt))
+ return;
clone = skb_clone(skb, GFP_ATOMIC);
- if (!clone)
+ if (!clone) {
+ sock_put(sk);
return;
+ }
clone->sk = sk;
phydev->drv->txtstamp(phydev, clone, type);
}
@@ -77,8 +82,11 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
struct sock_exterr_skb *serr;
int err;
- if (!hwtstamps)
+ if (!hwtstamps) {
+ sock_put(sk);
+ kfree_skb(skb);
return;
+ }
*skb_hwtstamps(skb) = *hwtstamps;
serr = SKB_EXT_ERR(skb);
@@ -87,6 +95,7 @@ void skb_complete_tx_timestamp(struct sk_buff *skb,
serr->ee.ee_origin = SO_EE_ORIGIN_TIMESTAMPING;
skb->sk = NULL;
err = sock_queue_err_skb(sk, skb);
+ sock_put(sk);
if (err)
kfree_skb(skb);
}
diff --git a/net/core/user_dma.c b/net/core/user_dma.c
index 25d717ebc92..1b5fefdb819 100644
--- a/net/core/user_dma.c
+++ b/net/core/user_dma.c
@@ -27,6 +27,7 @@
#include <linux/dmaengine.h>
#include <linux/socket.h>
+#include <linux/export.h>
#include <net/tcp.h>
#include <net/netdma.h>
@@ -71,14 +72,14 @@ int dma_skb_copy_datagram_iovec(struct dma_chan *chan,
/* Copy paged appendix. Hmm... why does this look so complicated? */
for (i = 0; i < skb_shinfo(skb)->nr_frags; i++) {
int end;
+ const skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
WARN_ON(start > offset + len);
- end = start + skb_shinfo(skb)->frags[i].size;
+ end = start + skb_frag_size(frag);
copy = end - offset;
if (copy > 0) {
- skb_frag_t *frag = &skb_shinfo(skb)->frags[i];
- struct page *page = frag->page;
+ struct page *page = skb_frag_page(frag);
if (copy > len)
copy = len;