summaryrefslogtreecommitdiff
path: root/drivers/net/ethernet/mellanox/mlx4/en_rx.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2014-12-11 14:27:06 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2014-12-11 14:27:06 -0800
commit70e71ca0af244f48a5dcf56dc435243792e3a495 (patch)
treef7d9c4c4d9a857a00043e9bf6aa2d6f533a34778 /drivers/net/ethernet/mellanox/mlx4/en_rx.c
parentbae41e45b7400496b9bf0c70c6004419d9987819 (diff)
parent00c83b01d58068dfeb2e1351cca6fccf2a83fa8f (diff)
Merge git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next
Pull networking updates from David Miller: 1) New offloading infrastructure and example 'rocker' driver for offloading of switching and routing to hardware. This work was done by a large group of dedicated individuals, not limited to: Scott Feldman, Jiri Pirko, Thomas Graf, John Fastabend, Jamal Hadi Salim, Andy Gospodarek, Florian Fainelli, Roopa Prabhu 2) Start making the networking operate on IOV iterators instead of modifying iov objects in-situ during transfers. Thanks to Al Viro and Herbert Xu. 3) A set of new netlink interfaces for the TIPC stack, from Richard Alpe. 4) Remove unnecessary looping during ipv6 routing lookups, from Martin KaFai Lau. 5) Add PAUSE frame generation support to gianfar driver, from Matei Pavaluca. 6) Allow for larger reordering levels in TCP, which are easily achievable in the real world right now, from Eric Dumazet. 7) Add a variable of napi_schedule that doesn't need to disable cpu interrupts, from Eric Dumazet. 8) Use a doubly linked list to optimize neigh_parms_release(), from Nicolas Dichtel. 9) Various enhancements to the kernel BPF verifier, and allow eBPF programs to actually be attached to sockets. From Alexei Starovoitov. 10) Support TSO/LSO in sunvnet driver, from David L Stevens. 11) Allow controlling ECN usage via routing metrics, from Florian Westphal. 12) Remote checksum offload, from Tom Herbert. 13) Add split-header receive, BQL, and xmit_more support to amd-xgbe driver, from Thomas Lendacky. 14) Add MPLS support to openvswitch, from Simon Horman. 15) Support wildcard tunnel endpoints in ipv6 tunnels, from Steffen Klassert. 16) Do gro flushes on a per-device basis using a timer, from Eric Dumazet. This tries to resolve the conflicting goals between the desired handling of bulk vs. RPC-like traffic. 17) Allow userspace to ask for the CPU upon what a packet was received/steered, via SO_INCOMING_CPU. From Eric Dumazet. 18) Limit GSO packets to half the current congestion window, from Eric Dumazet. 19) Add a generic helper so that all drivers set their RSS keys in a consistent way, from Eric Dumazet. 20) Add xmit_more support to enic driver, from Govindarajulu Varadarajan. 21) Add VLAN packet scheduler action, from Jiri Pirko. 22) Support configurable RSS hash functions via ethtool, from Eyal Perry. * git://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next: (1820 commits) Fix race condition between vxlan_sock_add and vxlan_sock_release net/macb: fix compilation warning for print_hex_dump() called with skb->mac_header net/mlx4: Add support for A0 steering net/mlx4: Refactor QUERY_PORT net/mlx4_core: Add explicit error message when rule doesn't meet configuration net/mlx4: Add A0 hybrid steering net/mlx4: Add mlx4_bitmap zone allocator net/mlx4: Add a check if there are too many reserved QPs net/mlx4: Change QP allocation scheme net/mlx4_core: Use tasklet for user-space CQ completion events net/mlx4_core: Mask out host side virtualization features for guests net/mlx4_en: Set csum level for encapsulated packets be2net: Export tunnel offloads only when a VxLAN tunnel is created gianfar: Fix dma check map error when DMA_API_DEBUG is enabled cxgb4/csiostor: Don't use MASTER_MUST for fw_hello call net: fec: only enable mdio interrupt before phy device link up net: fec: clear all interrupt events to support i.MX6SX net: fec: reset fep link status in suspend function net: sock: fix access via invalid file descriptor net: introduce helper macro for_each_cmsghdr ...
Diffstat (limited to 'drivers/net/ethernet/mellanox/mlx4/en_rx.c')
-rw-r--r--drivers/net/ethernet/mellanox/mlx4/en_rx.c303
1 files changed, 208 insertions, 95 deletions
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 01660c595f5c..a0474eb94aa3 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -42,6 +42,10 @@
#include <linux/vmalloc.h>
#include <linux/irq.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_checksum.h>
+#endif
+
#include "mlx4_en.h"
static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
@@ -74,7 +78,7 @@ static int mlx4_alloc_pages(struct mlx4_en_priv *priv,
page_alloc->page_size = PAGE_SIZE << order;
page_alloc->page = page;
page_alloc->dma = dma;
- page_alloc->page_offset = frag_info->frag_align;
+ page_alloc->page_offset = 0;
/* Not doing get_page() for each frag is a big win
* on asymetric workloads. Note we can not use atomic_set().
*/
@@ -119,7 +123,6 @@ static int mlx4_en_alloc_frags(struct mlx4_en_priv *priv,
out:
while (i--) {
- frag_info = &priv->frag_info[i];
if (page_alloc[i].page != ring_alloc[i].page) {
dma_unmap_page(priv->ddev, page_alloc[i].dma,
page_alloc[i].page_size, PCI_DMA_FROMDEVICE);
@@ -157,7 +160,7 @@ static int mlx4_en_init_allocator(struct mlx4_en_priv *priv,
const struct mlx4_en_frag_info *frag_info = &priv->frag_info[i];
if (mlx4_alloc_pages(priv, &ring->page_alloc[i],
- frag_info, GFP_KERNEL))
+ frag_info, GFP_KERNEL | __GFP_COLD))
goto out;
}
return 0;
@@ -269,7 +272,7 @@ static int mlx4_en_fill_rx_buffers(struct mlx4_en_priv *priv)
if (mlx4_en_prepare_rx_desc(priv, ring,
ring->actual_size,
- GFP_KERNEL)) {
+ GFP_KERNEL | __GFP_COLD)) {
if (ring->actual_size < MLX4_EN_MIN_RX_SIZE) {
en_err(priv, "Failed to allocate enough rx buffers\n");
return -ENOMEM;
@@ -636,13 +639,94 @@ static void mlx4_en_refill_rx_buffers(struct mlx4_en_priv *priv,
int index = ring->prod & ring->size_mask;
while ((u32) (ring->prod - ring->cons) < ring->actual_size) {
- if (mlx4_en_prepare_rx_desc(priv, ring, index, GFP_ATOMIC))
+ if (mlx4_en_prepare_rx_desc(priv, ring, index,
+ GFP_ATOMIC | __GFP_COLD))
break;
ring->prod++;
index = ring->prod & ring->size_mask;
}
}
+/* When hardware doesn't strip the vlan, we need to calculate the checksum
+ * over it and add it to the hardware's checksum calculation
+ */
+static inline __wsum get_fixed_vlan_csum(__wsum hw_checksum,
+ struct vlan_hdr *vlanh)
+{
+ return csum_add(hw_checksum, *(__wsum *)vlanh);
+}
+
+/* Although the stack expects checksum which doesn't include the pseudo
+ * header, the HW adds it. To address that, we are subtracting the pseudo
+ * header checksum from the checksum value provided by the HW.
+ */
+static void get_fixed_ipv4_csum(__wsum hw_checksum, struct sk_buff *skb,
+ struct iphdr *iph)
+{
+ __u16 length_for_csum = 0;
+ __wsum csum_pseudo_header = 0;
+
+ length_for_csum = (be16_to_cpu(iph->tot_len) - (iph->ihl << 2));
+ csum_pseudo_header = csum_tcpudp_nofold(iph->saddr, iph->daddr,
+ length_for_csum, iph->protocol, 0);
+ skb->csum = csum_sub(hw_checksum, csum_pseudo_header);
+}
+
+#if IS_ENABLED(CONFIG_IPV6)
+/* In IPv6 packets, besides subtracting the pseudo header checksum,
+ * we also compute/add the IP header checksum which
+ * is not added by the HW.
+ */
+static int get_fixed_ipv6_csum(__wsum hw_checksum, struct sk_buff *skb,
+ struct ipv6hdr *ipv6h)
+{
+ __wsum csum_pseudo_hdr = 0;
+
+ if (ipv6h->nexthdr == IPPROTO_FRAGMENT || ipv6h->nexthdr == IPPROTO_HOPOPTS)
+ return -1;
+ hw_checksum = csum_add(hw_checksum, (__force __wsum)(ipv6h->nexthdr << 8));
+
+ csum_pseudo_hdr = csum_partial(&ipv6h->saddr,
+ sizeof(ipv6h->saddr) + sizeof(ipv6h->daddr), 0);
+ csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ipv6h->payload_len);
+ csum_pseudo_hdr = csum_add(csum_pseudo_hdr, (__force __wsum)ntohs(ipv6h->nexthdr));
+
+ skb->csum = csum_sub(hw_checksum, csum_pseudo_hdr);
+ skb->csum = csum_add(skb->csum, csum_partial(ipv6h, sizeof(struct ipv6hdr), 0));
+ return 0;
+}
+#endif
+static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va,
+ int hwtstamp_rx_filter)
+{
+ __wsum hw_checksum = 0;
+
+ void *hdr = (u8 *)va + sizeof(struct ethhdr);
+
+ hw_checksum = csum_unfold((__force __sum16)cqe->checksum);
+
+ if (((struct ethhdr *)va)->h_proto == htons(ETH_P_8021Q) &&
+ hwtstamp_rx_filter != HWTSTAMP_FILTER_NONE) {
+ /* next protocol non IPv4 or IPv6 */
+ if (((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
+ != htons(ETH_P_IP) &&
+ ((struct vlan_hdr *)hdr)->h_vlan_encapsulated_proto
+ != htons(ETH_P_IPV6))
+ return -1;
+ hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
+ hdr += sizeof(struct vlan_hdr);
+ }
+
+ if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4))
+ get_fixed_ipv4_csum(hw_checksum, skb, hdr);
+#if IS_ENABLED(CONFIG_IPV6)
+ else if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV6))
+ if (get_fixed_ipv6_csum(hw_checksum, skb, hdr))
+ return -1;
+#endif
+ return 0;
+}
+
int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int budget)
{
struct mlx4_en_priv *priv = netdev_priv(dev);
@@ -744,73 +828,96 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
(cqe->vlan_my_qpn & cpu_to_be32(MLX4_CQE_L2_TUNNEL));
if (likely(dev->features & NETIF_F_RXCSUM)) {
- if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
- (cqe->checksum == cpu_to_be16(0xffff))) {
- ring->csum_ok++;
- /* This packet is eligible for GRO if it is:
- * - DIX Ethernet (type interpretation)
- * - TCP/IP (v4)
- * - without IP options
- * - not an IP fragment
- * - no LLS polling in progress
- */
- if (!mlx4_en_cq_busy_polling(cq) &&
- (dev->features & NETIF_F_GRO)) {
- struct sk_buff *gro_skb = napi_get_frags(&cq->napi);
- if (!gro_skb)
- goto next;
-
- nr = mlx4_en_complete_rx_desc(priv,
- rx_desc, frags, gro_skb,
- length);
- if (!nr)
- goto next;
+ if (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_TCP |
+ MLX4_CQE_STATUS_UDP)) {
+ if ((cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPOK)) &&
+ cqe->checksum == cpu_to_be16(0xffff)) {
+ ip_summed = CHECKSUM_UNNECESSARY;
+ ring->csum_ok++;
+ } else {
+ ip_summed = CHECKSUM_NONE;
+ ring->csum_none++;
+ }
+ } else {
+ if (priv->flags & MLX4_EN_FLAG_RX_CSUM_NON_TCP_UDP &&
+ (cqe->status & cpu_to_be16(MLX4_CQE_STATUS_IPV4 |
+ MLX4_CQE_STATUS_IPV6))) {
+ ip_summed = CHECKSUM_COMPLETE;
+ ring->csum_complete++;
+ } else {
+ ip_summed = CHECKSUM_NONE;
+ ring->csum_none++;
+ }
+ }
+ } else {
+ ip_summed = CHECKSUM_NONE;
+ ring->csum_none++;
+ }
- skb_shinfo(gro_skb)->nr_frags = nr;
- gro_skb->len = length;
- gro_skb->data_len = length;
- gro_skb->ip_summed = CHECKSUM_UNNECESSARY;
+ /* This packet is eligible for GRO if it is:
+ * - DIX Ethernet (type interpretation)
+ * - TCP/IP (v4)
+ * - without IP options
+ * - not an IP fragment
+ * - no LLS polling in progress
+ */
+ if (!mlx4_en_cq_busy_polling(cq) &&
+ (dev->features & NETIF_F_GRO)) {
+ struct sk_buff *gro_skb = napi_get_frags(&cq->napi);
+ if (!gro_skb)
+ goto next;
+
+ nr = mlx4_en_complete_rx_desc(priv,
+ rx_desc, frags, gro_skb,
+ length);
+ if (!nr)
+ goto next;
+
+ if (ip_summed == CHECKSUM_COMPLETE) {
+ void *va = skb_frag_address(skb_shinfo(gro_skb)->frags);
+ if (check_csum(cqe, gro_skb, va, ring->hwtstamp_rx_filter)) {
+ ip_summed = CHECKSUM_NONE;
+ ring->csum_none++;
+ ring->csum_complete--;
+ }
+ }
- if (l2_tunnel)
- gro_skb->csum_level = 1;
- if ((cqe->vlan_my_qpn &
- cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) &&
- (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) {
- u16 vid = be16_to_cpu(cqe->sl_vid);
+ skb_shinfo(gro_skb)->nr_frags = nr;
+ gro_skb->len = length;
+ gro_skb->data_len = length;
+ gro_skb->ip_summed = ip_summed;
- __vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid);
- }
+ if (l2_tunnel && ip_summed == CHECKSUM_UNNECESSARY)
+ gro_skb->csum_level = 1;
- if (dev->features & NETIF_F_RXHASH)
- skb_set_hash(gro_skb,
- be32_to_cpu(cqe->immed_rss_invalid),
- PKT_HASH_TYPE_L3);
+ if ((cqe->vlan_my_qpn &
+ cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) &&
+ (dev->features & NETIF_F_HW_VLAN_CTAG_RX)) {
+ u16 vid = be16_to_cpu(cqe->sl_vid);
- skb_record_rx_queue(gro_skb, cq->ring);
- skb_mark_napi_id(gro_skb, &cq->napi);
+ __vlan_hwaccel_put_tag(gro_skb, htons(ETH_P_8021Q), vid);
+ }
- if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
- timestamp = mlx4_en_get_cqe_ts(cqe);
- mlx4_en_fill_hwtstamps(mdev,
- skb_hwtstamps(gro_skb),
- timestamp);
- }
+ if (dev->features & NETIF_F_RXHASH)
+ skb_set_hash(gro_skb,
+ be32_to_cpu(cqe->immed_rss_invalid),
+ PKT_HASH_TYPE_L3);
- napi_gro_frags(&cq->napi);
- goto next;
- }
+ skb_record_rx_queue(gro_skb, cq->ring);
+ skb_mark_napi_id(gro_skb, &cq->napi);
- /* GRO not possible, complete processing here */
- ip_summed = CHECKSUM_UNNECESSARY;
- } else {
- ip_summed = CHECKSUM_NONE;
- ring->csum_none++;
+ if (ring->hwtstamp_rx_filter == HWTSTAMP_FILTER_ALL) {
+ timestamp = mlx4_en_get_cqe_ts(cqe);
+ mlx4_en_fill_hwtstamps(mdev,
+ skb_hwtstamps(gro_skb),
+ timestamp);
}
- } else {
- ip_summed = CHECKSUM_NONE;
- ring->csum_none++;
+
+ napi_gro_frags(&cq->napi);
+ goto next;
}
+ /* GRO not possible, complete processing here */
skb = mlx4_en_rx_skb(priv, rx_desc, frags, length);
if (!skb) {
priv->stats.rx_dropped++;
@@ -822,6 +929,14 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud
goto next;
}
+ if (ip_summed == CHECKSUM_COMPLETE) {
+ if (check_csum(cqe, skb, skb->data, ring->hwtstamp_rx_filter)) {
+ ip_summed = CHECKSUM_NONE;
+ ring->csum_complete--;
+ ring->csum_none++;
+ }
+ }
+
skb->ip_summed = ip_summed;
skb->protocol = eth_type_trans(skb, dev);
skb_record_rx_queue(skb, cq->ring);
@@ -879,8 +994,8 @@ void mlx4_en_rx_irq(struct mlx4_cq *mcq)
struct mlx4_en_cq *cq = container_of(mcq, struct mlx4_en_cq, mcq);
struct mlx4_en_priv *priv = netdev_priv(cq->dev);
- if (priv->port_up)
- napi_schedule(&cq->napi);
+ if (likely(priv->port_up))
+ napi_schedule_irqoff(&cq->napi);
else
mlx4_en_arm_cq(priv, cq);
}
@@ -910,20 +1025,18 @@ int mlx4_en_poll_rx_cq(struct napi_struct *napi, int budget)
cpu_curr = smp_processor_id();
aff = irq_desc_get_irq_data(cq->irq_desc)->affinity;
- if (unlikely(!cpumask_test_cpu(cpu_curr, aff))) {
- /* Current cpu is not according to smp_irq_affinity -
- * probably affinity changed. need to stop this NAPI
- * poll, and restart it on the right CPU
- */
- napi_complete(napi);
- mlx4_en_arm_cq(priv, cq);
- return 0;
- }
- } else {
- /* Done for now */
- napi_complete(napi);
- mlx4_en_arm_cq(priv, cq);
+ if (likely(cpumask_test_cpu(cpu_curr, aff)))
+ return budget;
+
+ /* Current cpu is not according to smp_irq_affinity -
+ * probably affinity changed. need to stop this NAPI
+ * poll, and restart it on the right CPU
+ */
+ done = 0;
}
+ /* Done for now */
+ napi_complete_done(napi, done);
+ mlx4_en_arm_cq(priv, cq);
return done;
}
@@ -946,15 +1059,8 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
(eff_mtu > buf_size + frag_sizes[i]) ?
frag_sizes[i] : eff_mtu - buf_size;
priv->frag_info[i].frag_prefix_size = buf_size;
- if (!i) {
- priv->frag_info[i].frag_align = NET_IP_ALIGN;
- priv->frag_info[i].frag_stride =
- ALIGN(frag_sizes[i] + NET_IP_ALIGN, SMP_CACHE_BYTES);
- } else {
- priv->frag_info[i].frag_align = 0;
- priv->frag_info[i].frag_stride =
- ALIGN(frag_sizes[i], SMP_CACHE_BYTES);
- }
+ priv->frag_info[i].frag_stride = ALIGN(frag_sizes[i],
+ SMP_CACHE_BYTES);
buf_size += priv->frag_info[i].frag_size;
i++;
}
@@ -967,11 +1073,10 @@ void mlx4_en_calc_rx_buf(struct net_device *dev)
eff_mtu, priv->num_frags);
for (i = 0; i < priv->num_frags; i++) {
en_err(priv,
- " frag:%d - size:%d prefix:%d align:%d stride:%d\n",
+ " frag:%d - size:%d prefix:%d stride:%d\n",
i,
priv->frag_info[i].frag_size,
priv->frag_info[i].frag_prefix_size,
- priv->frag_info[i].frag_align,
priv->frag_info[i].frag_stride);
}
}
@@ -1026,7 +1131,8 @@ int mlx4_en_create_drop_qp(struct mlx4_en_priv *priv)
int err;
u32 qpn;
- err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn);
+ err = mlx4_qp_reserve_range(priv->mdev->dev, 1, 1, &qpn,
+ MLX4_RESERVE_A0_QP);
if (err) {
en_err(priv, "Failed reserving drop qpn\n");
return err;
@@ -1065,14 +1171,11 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
int i, qpn;
int err = 0;
int good_qps = 0;
- static const u32 rsskey[10] = { 0xD181C62C, 0xF7F4DB5B, 0x1983A2FC,
- 0x943E1ADB, 0xD9389E6B, 0xD1039C2C, 0xA74499AD,
- 0x593D56D9, 0xF3253C06, 0x2ADC1FFC};
en_dbg(DRV, priv, "Configuring rss steering\n");
err = mlx4_qp_reserve_range(mdev->dev, priv->rx_ring_num,
priv->rx_ring_num,
- &rss_map->base_qpn);
+ &rss_map->base_qpn, 0);
if (err) {
en_err(priv, "Failed reserving %d qps\n", priv->rx_ring_num);
return err;
@@ -1122,9 +1225,19 @@ int mlx4_en_config_rss_steer(struct mlx4_en_priv *priv)
rss_context->flags = rss_mask;
rss_context->hash_fn = MLX4_RSS_HASH_TOP;
- for (i = 0; i < 10; i++)
- rss_context->rss_key[i] = cpu_to_be32(rsskey[i]);
-
+ if (priv->rss_hash_fn == ETH_RSS_HASH_XOR) {
+ rss_context->hash_fn = MLX4_RSS_HASH_XOR;
+ } else if (priv->rss_hash_fn == ETH_RSS_HASH_TOP) {
+ rss_context->hash_fn = MLX4_RSS_HASH_TOP;
+ memcpy(rss_context->rss_key, priv->rss_key,
+ MLX4_EN_RSS_KEY_SIZE);
+ netdev_rss_key_fill(rss_context->rss_key,
+ MLX4_EN_RSS_KEY_SIZE);
+ } else {
+ en_err(priv, "Unknown RSS hash function requested\n");
+ err = -EINVAL;
+ goto indir_err;
+ }
err = mlx4_qp_to_ready(mdev->dev, &priv->res.mtt, &context,
&rss_map->indir_qp, &rss_map->indir_state);
if (err)