| author | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-17 19:34:12 -0700 |
|---|---|---|
| committer | Linus Torvalds <torvalds@linux-foundation.org> | 2011-03-17 19:34:12 -0700 |
| commit | 08351fc6a75731226e1112fc7254542bd3a2912e (patch) | |
| tree | 8b25bd168e0663c766f0332c8be082aa7d6ed265 /drivers | |
| parent | 0df0914d414a504b975f3cc66ace0c16ef55b7f3 (diff) | |
| parent | 0dccb0489f9a5a13a33e828ab965aa49685d12f8 (diff) | |
Merge git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile
* git://git.kernel.org/pub/scm/linux/kernel/git/cmetcalf/linux-tile: (27 commits)
arch/tile: support newer binutils assembler shift semantics
arch/tile: fix deadlock bugs in rwlock implementation
drivers/edac: provide support for tile architecture
tile on-chip network driver: sync up with latest fixes
arch/tile: support 4KB page size as well as 64KB
arch/tile: add some more VMSPLIT options and use consistent naming
arch/tile: fix some comments and whitespace
arch/tile: export some additional module symbols
arch/tile: enhance existing finv_buffer_remote() routine
arch/tile: fix two bugs in the backtracer code
arch/tile: use extended assembly to inline __mb_incoherent()
arch/tile: use a cleaner technique to enable interrupt for cpu_idle()
arch/tile: sync up with <arch/sim.h> and <arch/sim_def.h> changes
arch/tile: fix reversed test of strict_strtol() return value
arch/tile: avoid a simulator warning during bootup
arch/tile: export <asm/hardwall.h> to userspace
arch/tile: warn and retry if an IPI is not accepted by the target cpu
arch/tile: stop disabling INTCTRL_1 interrupts during hypervisor downcalls
arch/tile: fix __ndelay etc to work better
arch/tile: bug fix: exec'ed task thought it was still single-stepping
...
Fix up trivial conflict in arch/tile/kernel/vmlinux.lds.S (percpu
alignment vs section naming convention fix)
Diffstat (limited to 'drivers')
| -rw-r--r-- | drivers/edac/Kconfig | 10 |
| -rw-r--r-- | drivers/edac/Makefile | 1 |
| -rw-r--r-- | drivers/edac/tile_edac.c | 254 |
| -rw-r--r-- | drivers/net/tile/tilepro.c | 965 |
4 files changed, 781 insertions, 449 deletions
diff --git a/drivers/edac/Kconfig b/drivers/edac/Kconfig index fe70a341bd8..fac1a2002e6 100644 --- a/drivers/edac/Kconfig +++ b/drivers/edac/Kconfig @@ -7,7 +7,7 @@ menuconfig EDAC bool "EDAC (Error Detection And Correction) reporting" depends on HAS_IOMEM - depends on X86 || PPC + depends on X86 || PPC || TILE help EDAC is designed to report errors in the core system. These are low-level errors that are reported in the CPU or @@ -282,4 +282,12 @@ config EDAC_CPC925 a companion chip to the PowerPC 970 family of processors. +config EDAC_TILE + tristate "Tilera Memory Controller" + depends on EDAC_MM_EDAC && TILE + default y + help + Support for error detection and correction on the + Tilera memory controller. + endif # EDAC diff --git a/drivers/edac/Makefile b/drivers/edac/Makefile index ba2898b3639..3e239133e29 100644 --- a/drivers/edac/Makefile +++ b/drivers/edac/Makefile @@ -54,3 +54,4 @@ obj-$(CONFIG_EDAC_PPC4XX) += ppc4xx_edac.o obj-$(CONFIG_EDAC_AMD8111) += amd8111_edac.o obj-$(CONFIG_EDAC_AMD8131) += amd8131_edac.o +obj-$(CONFIG_EDAC_TILE) += tile_edac.o diff --git a/drivers/edac/tile_edac.c b/drivers/edac/tile_edac.c new file mode 100644 index 00000000000..1d5cf06f6c6 --- /dev/null +++ b/drivers/edac/tile_edac.c @@ -0,0 +1,254 @@ +/* + * Copyright 2011 Tilera Corporation. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation, version 2. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for + * more details. + * Tilera-specific EDAC driver. + * + * This source code is derived from the following driver: + * + * Cell MIC driver for ECC counting + * + * Copyright 2007 Benjamin Herrenschmidt, IBM Corp. + * <benh@kernel.crashing.org> + * + */ + +#include <linux/module.h> +#include <linux/init.h> +#include <linux/platform_device.h> +#include <linux/io.h> +#include <linux/uaccess.h> +#include <linux/edac.h> +#include <hv/hypervisor.h> +#include <hv/drv_mshim_intf.h> + +#include "edac_core.h" + +#define DRV_NAME "tile-edac" + +/* Number of cs_rows needed per memory controller on TILEPro. */ +#define TILE_EDAC_NR_CSROWS 1 + +/* Number of channels per memory controller on TILEPro. */ +#define TILE_EDAC_NR_CHANS 1 + +/* Granularity of reported error in bytes on TILEPro. */ +#define TILE_EDAC_ERROR_GRAIN 8 + +/* TILE processor has multiple independent memory controllers. */ +struct platform_device *mshim_pdev[TILE_MAX_MSHIMS]; + +struct tile_edac_priv { + int hv_devhdl; /* Hypervisor device handle. */ + int node; /* Memory controller instance #. */ + unsigned int ce_count; /* + * Correctable-error counter + * kept by the driver. + */ +}; + +static void tile_edac_check(struct mem_ctl_info *mci) +{ + struct tile_edac_priv *priv = mci->pvt_info; + struct mshim_mem_error mem_error; + + if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_error, + sizeof(struct mshim_mem_error), MSHIM_MEM_ERROR_OFF) != + sizeof(struct mshim_mem_error)) { + pr_err(DRV_NAME ": MSHIM_MEM_ERROR_OFF pread failure.\n"); + return; + } + + /* Check if the current error count is different from the saved one. 
*/ + if (mem_error.sbe_count != priv->ce_count) { + dev_dbg(mci->dev, "ECC CE err on node %d\n", priv->node); + priv->ce_count = mem_error.sbe_count; + edac_mc_handle_ce(mci, 0, 0, 0, 0, 0, mci->ctl_name); + } +} + +/* + * Initialize the 'csrows' table within the mci control structure with the + * addressing of memory. + */ +static int __devinit tile_edac_init_csrows(struct mem_ctl_info *mci) +{ + struct csrow_info *csrow = &mci->csrows[0]; + struct tile_edac_priv *priv = mci->pvt_info; + struct mshim_mem_info mem_info; + + if (hv_dev_pread(priv->hv_devhdl, 0, (HV_VirtAddr)&mem_info, + sizeof(struct mshim_mem_info), MSHIM_MEM_INFO_OFF) != + sizeof(struct mshim_mem_info)) { + pr_err(DRV_NAME ": MSHIM_MEM_INFO_OFF pread failure.\n"); + return -1; + } + + if (mem_info.mem_ecc) + csrow->edac_mode = EDAC_SECDED; + else + csrow->edac_mode = EDAC_NONE; + switch (mem_info.mem_type) { + case DDR2: + csrow->mtype = MEM_DDR2; + break; + + case DDR3: + csrow->mtype = MEM_DDR3; + break; + + default: + return -1; + } + + csrow->first_page = 0; + csrow->nr_pages = mem_info.mem_size >> PAGE_SHIFT; + csrow->last_page = csrow->first_page + csrow->nr_pages - 1; + csrow->grain = TILE_EDAC_ERROR_GRAIN; + csrow->dtype = DEV_UNKNOWN; + + return 0; +} + +static int __devinit tile_edac_mc_probe(struct platform_device *pdev) +{ + char hv_file[32]; + int hv_devhdl; + struct mem_ctl_info *mci; + struct tile_edac_priv *priv; + int rc; + + sprintf(hv_file, "mshim/%d", pdev->id); + hv_devhdl = hv_dev_open((HV_VirtAddr)hv_file, 0); + if (hv_devhdl < 0) + return -EINVAL; + + /* A TILE MC has a single channel and one chip-select row. */ + mci = edac_mc_alloc(sizeof(struct tile_edac_priv), + TILE_EDAC_NR_CSROWS, TILE_EDAC_NR_CHANS, pdev->id); + if (mci == NULL) + return -ENOMEM; + priv = mci->pvt_info; + priv->node = pdev->id; + priv->hv_devhdl = hv_devhdl; + + mci->dev = &pdev->dev; + mci->mtype_cap = MEM_FLAG_DDR2; + mci->edac_ctl_cap = EDAC_FLAG_SECDED; + + mci->mod_name = DRV_NAME; + mci->ctl_name = "TILEPro_Memory_Controller"; + mci->dev_name = dev_name(&pdev->dev); + mci->edac_check = tile_edac_check; + + /* + * Initialize the MC control structure 'csrows' table + * with the mapping and control information. + */ + if (tile_edac_init_csrows(mci)) { + /* No csrows found. */ + mci->edac_cap = EDAC_FLAG_NONE; + } else { + mci->edac_cap = EDAC_FLAG_SECDED; + } + + platform_set_drvdata(pdev, mci); + + /* Register with EDAC core */ + rc = edac_mc_add_mc(mci); + if (rc) { + dev_err(&pdev->dev, "failed to register with EDAC core\n"); + edac_mc_free(mci); + return rc; + } + + return 0; +} + +static int __devexit tile_edac_mc_remove(struct platform_device *pdev) +{ + struct mem_ctl_info *mci = platform_get_drvdata(pdev); + + edac_mc_del_mc(&pdev->dev); + if (mci) + edac_mc_free(mci); + return 0; +} + +static struct platform_driver tile_edac_mc_driver = { + .driver = { + .name = DRV_NAME, + .owner = THIS_MODULE, + }, + .probe = tile_edac_mc_probe, + .remove = __devexit_p(tile_edac_mc_remove), +}; + +/* + * Driver init routine. + */ +static int __init tile_edac_init(void) +{ + char hv_file[32]; + struct platform_device *pdev; + int i, err, num = 0; + + /* Only support POLL mode. */ + edac_op_state = EDAC_OPSTATE_POLL; + + err = platform_driver_register(&tile_edac_mc_driver); + if (err) + return err; + + for (i = 0; i < TILE_MAX_MSHIMS; i++) { + /* + * Not all memory controllers are configured such as in the + * case of a simulator. So we register only those mshims + * that are configured by the hypervisor. 
+ */ + sprintf(hv_file, "mshim/%d", i); + if (hv_dev_open((HV_VirtAddr)hv_file, 0) < 0) + continue; + + pdev = platform_device_register_simple(DRV_NAME, i, NULL, 0); + if (IS_ERR(pdev)) + continue; + mshim_pdev[i] = pdev; + num++; + } + + if (num == 0) { + platform_driver_unregister(&tile_edac_mc_driver); + return -ENODEV; + } + return 0; +} + +/* + * Driver cleanup routine. + */ +static void __exit tile_edac_exit(void) +{ + int i; + + for (i = 0; i < TILE_MAX_MSHIMS; i++) { + struct platform_device *pdev = mshim_pdev[i]; + if (!pdev) + continue; + + platform_set_drvdata(pdev, NULL); + platform_device_unregister(pdev); + } + platform_driver_unregister(&tile_edac_mc_driver); +} + +module_init(tile_edac_init); +module_exit(tile_edac_exit); diff --git a/drivers/net/tile/tilepro.c b/drivers/net/tile/tilepro.c index 7cb301da747..0825db6d883 100644 --- a/drivers/net/tile/tilepro.c +++ b/drivers/net/tile/tilepro.c @@ -1,5 +1,5 @@ /* - * Copyright 2010 Tilera Corporation. All Rights Reserved. + * Copyright 2011 Tilera Corporation. All Rights Reserved. * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License @@ -44,10 +44,6 @@ #include <linux/tcp.h> -/* There is no singlethread_cpu, so schedule work on the current cpu. */ -#define singlethread_cpu -1 - - /* * First, "tile_net_init_module()" initializes all four "devices" which * can be used by linux. @@ -73,15 +69,16 @@ * return, knowing we will be called again later. Otherwise, we * reenable the ingress interrupt, and call "napi_complete()". * + * HACK: Since disabling the ingress interrupt is not reliable, we + * ignore the interrupt if the global "active" flag is false. + * * * NOTE: The use of "native_driver" ensures that EPP exists, and that - * "epp_sendv" is legal, and that "LIPP" is being used. + * we are using "LIPP" and "LEPP". * * NOTE: Failing to free completions for an arbitrarily long time * (which is defined to be illegal) does in fact cause bizarre * problems. The "egress_timer" helps prevent this from happening. - * - * NOTE: The egress code can be interrupted by the interrupt handler. */ @@ -142,6 +139,7 @@ MODULE_AUTHOR("Tilera"); MODULE_LICENSE("GPL"); + /* * Queue of incoming packets for a specific cpu and device. * @@ -177,7 +175,7 @@ struct tile_net_cpu { struct tile_netio_queue queue; /* Statistics. */ struct tile_net_stats_t stats; - /* ISSUE: Is this needed? */ + /* True iff NAPI is enabled. */ bool napi_enabled; /* True if this tile has succcessfully registered with the IPP. */ bool registered; @@ -200,20 +198,20 @@ struct tile_net_cpu { struct tile_net_priv { /* Our network device. */ struct net_device *dev; - /* The actual egress queue. */ - lepp_queue_t *epp_queue; - /* Protects "epp_queue->cmd_tail" and "epp_queue->comp_tail" */ - spinlock_t cmd_lock; - /* Protects "epp_queue->comp_head". */ - spinlock_t comp_lock; + /* Pages making up the egress queue. */ + struct page *eq_pages; + /* Address of the actual egress queue. */ + lepp_queue_t *eq; + /* Protects "eq". */ + spinlock_t eq_lock; /* The hypervisor handle for this interface. */ int hv_devhdl; /* The intr bit mask that IDs this device. */ u32 intr_id; /* True iff "tile_net_open_aux()" has succeeded. */ - int partly_opened; - /* True iff "tile_net_open_inner()" has succeeded. */ - int fully_opened; + bool partly_opened; + /* True iff the device is "active". */ + bool active; /* Effective network cpus. */ struct cpumask network_cpus_map; /* Number of network cpus. 
*/ @@ -228,6 +226,10 @@ struct tile_net_priv { struct tile_net_cpu *cpu[NR_CPUS]; }; +/* Log2 of the number of small pages needed for the egress queue. */ +#define EQ_ORDER get_order(sizeof(lepp_queue_t)) +/* Size of the egress queue's pages. */ +#define EQ_SIZE (1 << (PAGE_SHIFT + EQ_ORDER)) /* * The actual devices (xgbe0, xgbe1, gbe0, gbe1). @@ -284,7 +286,11 @@ static void net_printk(char *fmt, ...) */ static void dump_packet(unsigned char *data, unsigned long length, char *s) { + int my_cpu = smp_processor_id(); + unsigned long i; + char buf[128]; + static unsigned int count; pr_info("dump_packet(data %p, length 0x%lx s %s count 0x%x)\n", @@ -294,10 +300,12 @@ static void dump_packet(unsigned char *data, unsigned long length, char *s) for (i = 0; i < length; i++) { if ((i & 0xf) == 0) - sprintf(buf, "%8.8lx:", i); + sprintf(buf, "[%02d] %8.8lx:", my_cpu, i); sprintf(buf + strlen(buf), " %2.2x", data[i]); - if ((i & 0xf) == 0xf || i == length - 1) - pr_info("%s\n", buf); + if ((i & 0xf) == 0xf || i == length - 1) { + strcat(buf, "\n"); + pr_info("%s", buf); + } } } #endif @@ -351,60 +359,109 @@ static void tile_net_provide_linux_buffer(struct tile_net_cpu *info, /* * Provide a linux buffer for LIPP. + * + * Note that the ACTUAL allocation for each buffer is a "struct sk_buff", + * plus a chunk of memory that includes not only the requested bytes, but + * also NET_SKB_PAD bytes of initial padding, and a "struct skb_shared_info". + * + * Note that "struct skb_shared_info" is 88 bytes with 64K pages and + * 268 bytes with 4K pages (since the frags[] array needs 18 entries). + * + * Without jumbo packets, the maximum packet size will be 1536 bytes, + * and we use 2 bytes (NET_IP_ALIGN) of padding. ISSUE: If we told + * the hardware to clip at 1518 bytes instead of 1536 bytes, then we + * could save an entire cache line, but in practice, we don't need it. + * + * Since CPAs are 38 bits, and we can only encode the high 31 bits in + * a "linux_buffer_t", the low 7 bits must be zero, and thus, we must + * align the actual "va" mod 128. + * + * We assume that the underlying "head" will be aligned mod 64. Note + * that in practice, we have seen "head" NOT aligned mod 128 even when + * using 2048 byte allocations, which is surprising. + * + * If "head" WAS always aligned mod 128, we could change LIPP to + * assume that the low SIX bits are zero, and the 7th bit is one, that + * is, align the actual "va" mod 128 plus 64, which would be "free". + * + * For now, the actual "head" pointer points at NET_SKB_PAD bytes of + * padding, plus 28 or 92 bytes of extra padding, plus the sk_buff + * pointer, plus the NET_IP_ALIGN padding, plus 126 or 1536 bytes for + * the actual packet, plus 62 bytes of empty padding, plus some + * padding and the "struct skb_shared_info". + * + * With 64K pages, a large buffer thus needs 32+92+4+2+1536+62+88 + * bytes, or 1816 bytes, which fits comfortably into 2048 bytes. + * + * With 64K pages, a small buffer thus needs 32+92+4+2+126+88 + * bytes, or 344 bytes, which means we are wasting 64+ bytes, and + * could presumably increase the size of small buffers. + * + * With 4K pages, a large buffer thus needs 32+92+4+2+1536+62+268 + * bytes, or 1996 bytes, which fits comfortably into 2048 bytes. + * + * With 4K pages, a small buffer thus needs 32+92+4+2+126+268 + * bytes, or 524 bytes, which is annoyingly wasteful. + * + * Maybe we should increase LIPP_SMALL_PACKET_SIZE to 192? + * + * ISSUE: Maybe we should increase "NET_SKB_PAD" to 64? 
*/ static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info, bool small) { - /* ISSUE: What should we use here? */ +#if TILE_NET_MTU <= 1536 + /* Without "jumbo", 2 + 1536 should be sufficient. */ + unsigned int large_size = NET_IP_ALIGN + 1536; +#else + /* ISSUE: This has not been tested. */ unsigned int large_size = NET_IP_ALIGN + TILE_NET_MTU + 100; +#endif - /* Round up to ensure to avoid "false sharing" with last cache line. */ - unsigned int buffer_size = + /* Avoid "false sharing" with last cache line. */ + /* ISSUE: This is already done by "dev_alloc_skb()". */ + unsigned int len = (((small ? LIPP_SMALL_PACKET_SIZE : large_size) + CHIP_L2_LINE_SIZE() - 1) & -CHIP_L2_LINE_SIZE()); - /* - * ISSUE: Since CPAs are 38 bits, and we can only encode the - * high 31 bits in a "linux_buffer_t", the low 7 bits must be - * zero, and thus, we must align the actual "va" mod 128. - */ - const unsigned long align = 128; + unsigned int padding = 128 - NET_SKB_PAD; + unsigned int align; struct sk_buff *skb; void *va; struct sk_buff **skb_ptr; - /* Note that "dev_alloc_skb()" adds NET_SKB_PAD more bytes, */ - /* and also "reserves" that many bytes. */ - /* ISSUE: Can we "share" the NET_SKB_PAD bytes with "skb_ptr"? */ - int len = sizeof(*skb_ptr) + align + buffer_size; - - while (1) { - - /* Allocate (or fail). */ - skb = dev_alloc_skb(len); - if (skb == NULL) - return false; - - /* Make room for a back-pointer to 'skb'. */ - skb_reserve(skb, sizeof(*skb_ptr)); + /* Request 96 extra bytes for alignment purposes. */ + skb = dev_alloc_skb(len + padding); + if (skb == NULL) + return false; - /* Make sure we are aligned. */ - skb_reserve(skb, -(long)skb->data & (align - 1)); + /* Skip 32 or 96 bytes to align "data" mod 128. */ + align = -(long)skb->data & (128 - 1); + BUG_ON(align > padding); + skb_reserve(skb, align); - /* This address is given to IPP. */ - va = skb->data; + /* This address is given to IPP. */ + va = skb->data; - if (small) - break; + /* Buffers must not span a huge page. */ + BUG_ON(((((long)va & ~HPAGE_MASK) + len) & HPAGE_MASK) != 0); - /* ISSUE: This has never been observed! */ - /* Large buffers must not span a huge page. */ - if (((((long)va & ~HPAGE_MASK) + 1535) & HPAGE_MASK) == 0) - break; - pr_err("Leaking unaligned linux buffer at %p.\n", va); +#ifdef TILE_NET_PARANOIA +#if CHIP_HAS_CBOX_HOME_MAP() + if (hash_default) { + HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)va); + if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3) + panic("Non-HFH ingress buffer! VA=%p Mode=%d PTE=%llx", + va, hv_pte_get_mode(pte), hv_pte_val(pte)); } +#endif +#endif + + /* Invalidate the packet buffer. */ + if (!hash_default) + __inv_buffer(va, len); /* Skip two bytes to satisfy LIPP assumptions. */ /* Note that this aligns IP on a 16 byte boundary. */ @@ -415,23 +472,9 @@ static bool tile_net_provide_needed_buffer(struct tile_net_cpu *info, skb_ptr = va - sizeof(*skb_ptr); *skb_ptr = skb; - /* Invalidate the packet buffer. */ - if (!hash_default) - __inv_buffer(skb->data, buffer_size); - /* Make sure "skb_ptr" has been flushed. */ __insn_mf(); -#ifdef TILE_NET_PARANOIA -#if CHIP_HAS_CBOX_HOME_MAP() - if (hash_default) { - HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)va); - if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3) - panic("Non-coherent ingress buffer!"); - } -#endif -#endif - /* Provide the new buffer. 
*/ tile_net_provide_linux_buffer(info, va, small); @@ -469,48 +512,64 @@ oops: * Grab some LEPP completions, and store them in "comps", of size * "comps_size", and return the number of completions which were * stored, so the caller can free them. - * - * If "pending" is not NULL, it will be set to true if there might - * still be some pending completions caused by this tile, else false. */ -static unsigned int tile_net_lepp_grab_comps(struct net_device *dev, +static unsigned int tile_net_lepp_grab_comps(lepp_queue_t *eq, struct sk_buff *comps[], unsigned int comps_size, - bool *pending) + unsigned int min_size) { - struct tile_net_priv *priv = netdev_priv(dev); - - lepp_queue_t *eq = priv->epp_queue; - unsigned int n = 0; - unsigned int comp_head; - unsigned int comp_busy; - unsigned int comp_tail; - - spin_lock(&priv->comp_lock); - - comp_head = eq->comp_head; - comp_busy = eq->comp_busy; - comp_tail = eq->comp_tail; + unsigned int comp_head = eq->comp_head; + unsigned int comp_busy = eq->comp_busy; while (comp_head != comp_busy && n < comps_size) { comps[n++] = eq->comps[comp_head]; LEPP_QINC(comp_head); } - if (pending != NULL) - *pending = (comp_head != comp_tail); + if (n < min_size) + return 0; eq->comp_head = comp_head; - spin_unlock(&priv->comp_lock); - return n; } /* + * Free some comps, and return true iff there are still some pending. + */ +static bool tile_net_lepp_free_comps(struct net_device *dev, bool all) +{ + struct tile_net_priv *priv = netdev_priv(dev); + + lepp_queue_t *eq = priv->eq; + + struct sk_buff *olds[64]; + unsigned int wanted = 64; + unsigned int i, n; + bool pending; + + spin_lock(&priv->eq_lock); + + if (all) + eq->comp_busy = eq->comp_tail; + + n = tile_net_lepp_grab_comps(eq, olds, wanted, 0); + + pending = (eq->comp_head != eq->comp_tail); + + spin_unlock(&priv->eq_lock); + + for (i = 0; i < n; i++) + kfree_skb(olds[i]); + + return pending; +} + + +/* * Make sure the egress timer is scheduled. * * Note that we use "schedule if not scheduled" logic instead of the more @@ -544,21 +603,11 @@ static void tile_net_handle_egress_timer(unsigned long arg) struct tile_net_cpu *info = (struct tile_net_cpu *)arg; struct net_device *dev = info->napi.dev; - struct sk_buff *olds[32]; - unsigned int wanted = 32; - unsigned int i, nolds = 0; - bool pending; - /* The timer is no longer scheduled. */ info->egress_timer_scheduled = false; - nolds = tile_net_lepp_grab_comps(dev, olds, wanted, &pending); - - for (i = 0; i < nolds; i++) - kfree_skb(olds[i]); - - /* Reschedule timer if needed. */ - if (pending) + /* Free comps, and reschedule timer if more are pending. */ + if (tile_net_lepp_free_comps(dev, false)) tile_net_schedule_egress_timer(info); } @@ -636,8 +685,39 @@ static bool is_dup_ack(char *s1, char *s2, unsigned int len) +static void tile_net_discard_aux(struct tile_net_cpu *info, int index) +{ + struct tile_netio_queue *queue = &info->queue; + netio_queue_impl_t *qsp = queue->__system_part; + netio_queue_user_impl_t *qup = &queue->__user_part; + + int index2_aux = index + sizeof(netio_pkt_t); + int index2 = + ((index2_aux == + qsp->__packet_receive_queue.__last_packet_plus_one) ? + 0 : index2_aux); + + netio_pkt_t *pkt = (netio_pkt_t *)((unsigned long) &qsp[1] + index); + + /* Extract the "linux_buffer_t". */ + unsigned int buffer = pkt->__packet.word; + + /* Convert "linux_buffer_t" to "va". */ + void *va = __va((phys_addr_t)(buffer >> 1) << 7); + + /* Acquire the associated "skb". 
*/ + struct sk_buff **skb_ptr = va - sizeof(*skb_ptr); + struct sk_buff *skb = *skb_ptr; + + kfree_skb(skb); + + /* Consume this packet. */ + qup->__packet_receive_read = index2; +} + + /* - * Like "tile_net_handle_packets()", but just discard packets. + * Like "tile_net_poll()", but just discard packets. */ static void tile_net_discard_packets(struct net_device *dev) { @@ -650,32 +730,8 @@ static void tile_net_discard_packets(struct net_device *dev) while (qup->__packet_receive_read != qsp->__packet_receive_queue.__packet_write) { - int index = qup->__packet_receive_read; - - int index2_aux = index + sizeof(netio_pkt_t); - int index2 = - ((index2_aux == - qsp->__packet_receive_queue.__last_packet_plus_one) ? - 0 : index2_aux); - - netio_pkt_t *pkt = (netio_pkt_t *) - ((unsigned long) &qsp[1] + index); - - /* Extract the "linux_buffer_t". */ - unsigned int buffer = pkt->__packet.word; - - /* Convert "linux_buffer_t" to "va". */ - void *va = __va((phys_addr_t)(buffer >> 1) << 7); - - /* Acquire the associated "skb". */ - struct sk_buff **skb_ptr = va - sizeof(*skb_ptr); - struct sk_buff *skb = *skb_ptr; - - kfree_skb(skb); - - /* Consume this packet. */ - qup->__packet_receive_read = index2; + tile_net_discard_aux(info, index); } } @@ -704,7 +760,8 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) netio_pkt_metadata_t *metadata = NETIO_PKT_METADATA(pkt); - /* Extract the packet size. */ + /* Extract the packet size. FIXME: Shouldn't the second line */ + /* get subtracted? Mostly moot, since it should be "zero". */ unsigned long len = (NETIO_PKT_CUSTOM_LENGTH(pkt) + NET_IP_ALIGN - NETIO_PACKET_PADDING); @@ -722,15 +779,6 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) /* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */ unsigned char *buf = va + NET_IP_ALIGN; -#ifdef IGNORE_DUP_ACKS - - static int other; - static int final; - static int keep; - static int skip; - -#endif - /* Invalidate the packet buffer. */ if (!hash_default) __inv_buffer(buf, len); @@ -745,16 +793,8 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) #ifdef TILE_NET_VERIFY_INGRESS if (!NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt) && NETIO_PKT_L4_CSUM_CALCULATED_M(metadata, pkt)) { - /* - * FIXME: This complains about UDP packets - * with a "zero" checksum (bug 6624). - */ -#ifdef TILE_NET_PANIC_ON_BAD - dump_packet(buf, len, "rx"); - panic("Bad L4 checksum."); -#else + /* Bug 6624: Includes UDP packets with a "zero" checksum. */ pr_warning("Bad L4 checksum on %d byte packet.\n", len); -#endif } if (!NETIO_PKT_L3_CSUM_CORRECT_M(metadata, pkt) && NETIO_PKT_L3_CSUM_CALCULATED_M(metadata, pkt)) { @@ -769,90 +809,29 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) } break; case NETIO_PKT_STATUS_BAD: -#ifdef TILE_NET_PANIC_ON_BAD - dump_packet(buf, len, "rx"); - panic("Unexpected BAD packet."); -#else - pr_warning("Unexpected BAD %d byte packet.\n", len); -#endif + pr_warning("Unexpected BAD %ld byte packet.\n", len); } #endif filter = 0; + /* ISSUE: Filter TCP packets with "bad" checksums? */ + if (!(dev->flags & IFF_UP)) { /* Filter packets received before we're up. */ filter = 1; + } else if (NETIO_PKT_STATUS_M(metadata, pkt) == NETIO_PKT_STATUS_BAD) { + /* Filter "truncated" packets. */ + filter = 1; } else if (!(dev->flags & IFF_PROMISC)) { - /* - * FIXME: Implement HW multicast filter. - */ - if (is_unicast_ether_addr(buf)) { + /* FIXME: Implement HW multicast filter. */ + if (!is_multicast_ether_addr(buf)) { /* Filter packets not for our address. 
*/ const u8 *mine = dev->dev_addr; filter = compare_ether_addr(mine, buf); } } -#ifdef IGNORE_DUP_ACKS - - if (len != 66) { - /* FIXME: Must check "is_tcp_ack(buf, len)" somehow. */ - - other++; - - } else if (index2 == - qsp->__packet_receive_queue.__packet_write) { - - final++; - - } else { - - netio_pkt_t *pkt2 = (netio_pkt_t *) - ((unsigned long) &qsp[1] + index2); - - netio_pkt_metadata_t *metadata2 = - NETIO_PKT_METADATA(pkt2); - - /* Extract the packet size. */ - unsigned long len2 = - (NETIO_PKT_CUSTOM_LENGTH(pkt2) + - NET_IP_ALIGN - NETIO_PACKET_PADDING); - - if (len2 == 66 && - NETIO_PKT_FLOW_HASH_M(metadata, pkt) == - NETIO_PKT_FLOW_HASH_M(metadata2, pkt2)) { - - /* Extract the "linux_buffer_t". */ - unsigned int buffer2 = pkt2->__packet.word; - - /* Convert "linux_buffer_t" to "va". */ - void *va2 = - __va((phys_addr_t)(buffer2 >> 1) << 7); - - /* Extract the packet data pointer. */ - /* Compare to "NETIO_PKT_CUSTOM_DATA(pkt)". */ - unsigned char *buf2 = va2 + NET_IP_ALIGN; - - /* Invalidate the packet buffer. */ - if (!hash_default) - __inv_buffer(buf2, len2); - - if (is_dup_ack(buf, buf2, len)) { - skip++; - filter = 1; - } else { - keep++; - } - } - } - - if (net_ratelimit()) - pr_info("Other %d Final %d Keep %d Skip %d.\n", - other, final, keep, skip); - -#endif - if (filter) { /* ISSUE: Update "drop" statistics? */ @@ -877,10 +856,7 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) /* NOTE: This call also sets "skb->dev = dev". */ skb->protocol = eth_type_trans(skb, dev); - /* ISSUE: Discard corrupt packets? */ - /* ISSUE: Discard packets with bad checksums? */ - - /* Avoid recomputing TCP/UDP checksums. */ + /* Avoid recomputing "good" TCP/UDP checksums. */ if (NETIO_PKT_L4_CSUM_CORRECT_M(metadata, pkt)) skb->ip_summed = CHECKSUM_UNNECESSARY; @@ -912,9 +888,14 @@ static bool tile_net_poll_aux(struct tile_net_cpu *info, int index) /* * Handle some packets for the given device on the current CPU. * - * ISSUE: The "rotting packet" race condition occurs if a packet - * arrives after the queue appears to be empty, and before the - * hypervisor interrupt is re-enabled. + * If "tile_net_stop()" is called on some other tile while this + * function is running, we will return, hopefully before that + * other tile asks us to call "napi_disable()". + * + * The "rotting packet" race condition occurs if a packet arrives + * during the extremely narrow window between the queue appearing to + * be empty, and the ingress interrupt being re-enabled. This happens + * a LOT under heavy network load. */ static int tile_net_poll(struct napi_struct *napi, int budget) { @@ -928,7 +909,7 @@ static int tile_net_poll(struct napi_struct *napi, int budget) unsigned int work = 0; - while (1) { + while (priv->active) { int index = qup->__packet_receive_read; if (index == qsp->__packet_receive_queue.__packet_write) break; @@ -941,19 +922,24 @@ static int tile_net_poll(struct napi_struct *napi, int budget) napi_complete(&info->napi); - /* Re-enable hypervisor interrupts. */ + if (!priv->active) + goto done; + + /* Re-enable the ingress interrupt. */ enable_percpu_irq(priv->intr_id); - /* HACK: Avoid the "rotting packet" problem. */ + /* HACK: Avoid the "rotting packet" problem (see above). */ if (qup->__packet_receive_read != - qsp->__packet_receive_queue.__packet_write) - napi_schedule(&info->napi); - - /* ISSUE: Handle completions? 
*/ + qsp->__packet_receive_queue.__packet_write) { + /* ISSUE: Sometimes this returns zero, presumably */ + /* because an interrupt was handled for this tile. */ + (void)napi_reschedule(&info->napi); + } done: - tile_net_provide_needed_buffers(info); + if (priv->active) + tile_net_provide_needed_buffers(info); return work; } @@ -961,6 +947,12 @@ done: /* * Handle an ingress interrupt for the given device on the current cpu. + * + * ISSUE: Sometimes this gets called after "disable_percpu_irq()" has + * been called! This is probably due to "pending hypervisor downcalls". + * + * ISSUE: Is there any race condition between the "napi_schedule()" here + * and the "napi_complete()" call above? */ static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr) { @@ -969,9 +961,15 @@ static irqreturn_t tile_net_handle_ingress_interrupt(int irq, void *dev_ptr) int my_cpu = smp_processor_id(); struct tile_net_cpu *info = priv->cpu[my_cpu]; - /* Disable hypervisor interrupt. */ + /* Disable the ingress interrupt. */ disable_percpu_irq(priv->intr_id); + /* Ignore unwanted interrupts. */ + if (!priv->active) + return IRQ_HANDLED; + + /* ISSUE: Sometimes "info->napi_enabled" is false here. */ + napi_schedule(&info->napi); return IRQ_HANDLED; @@ -1005,8 +1003,7 @@ static int tile_net_open_aux(struct net_device *dev) */ { int epp_home = hv_lotar_to_cpu(epp_lotar); - struct page *page = virt_to_page(priv->epp_queue); - homecache_change_page_home(page, 0, epp_home); + homecache_change_page_home(priv->eq_pages, EQ_ORDER, epp_home); } /* @@ -1015,9 +1012,9 @@ static int tile_net_open_aux(struct net_device *dev) { netio_ipp_address_t ea = { .va = 0, - .pa = __pa(priv->epp_queue), + .pa = __pa(priv->eq), .pte = hv_pte(0), - .size = PAGE_SIZE, + .size = EQ_SIZE, }; ea.pte = hv_pte_set_lotar(ea.pte, epp_lotar); ea.pte = hv_pte_set_mode(ea.pte, HV_PTE_MODE_CACHE_TILE_L3); @@ -1043,7 +1040,7 @@ static int tile_net_open_aux(struct net_device *dev) /* - * Register with hypervisor on each CPU. + * Register with hypervisor on the current CPU. * * Strangely, this function does important things even if it "fails", * which is especially common if the link is not up yet. Hopefully @@ -1092,7 +1089,8 @@ static void tile_net_register(void *dev_ptr) priv->cpu[my_cpu] = info; /* - * Register ourselves with the IPP. + * Register ourselves with LIPP. This does a lot of stuff, + * including invoking the LIPP registration code. */ ret = hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&config, @@ -1101,8 +1099,11 @@ static void tile_net_register(void *dev_ptr) PDEBUG("hv_dev_pwrite(NETIO_IPP_INPUT_REGISTER_OFF) returned %d\n", ret); if (ret < 0) { - printk(KERN_DEBUG "hv_dev_pwrite NETIO_IPP_INPUT_REGISTER_OFF" - " failure %d\n", ret); + if (ret != NETIO_LINK_DOWN) { + printk(KERN_DEBUG "hv_dev_pwrite " + "NETIO_IPP_INPUT_REGISTER_OFF failure %d\n", + ret); + } info->link_down = (ret == NETIO_LINK_DOWN); return; } @@ -1145,15 +1146,47 @@ static void tile_net_register(void *dev_ptr) NETIO_IPP_GET_FASTIO_OFF); PDEBUG("hv_dev_pread(NETIO_IPP_GET_FASTIO_OFF) returned %d\n", ret); - netif_napi_add(dev, &info->napi, tile_net_poll, 64); - /* Now we are registered. */ info->registered = true; } /* - * Unregister with hypervisor on each CPU. + * Deregister with hypervisor on the current CPU. + * + * This simply discards all our credits, so no more packets will be + * delivered to this tile. There may still be packets in our queue. + * + * Also, disable the ingress interrupt. 
+ */ +static void tile_net_deregister(void *dev_ptr) +{ + struct net_device *dev = (struct net_device *)dev_ptr; + struct tile_net_priv *priv = netdev_priv(dev); + int my_cpu = smp_processor_id(); + struct tile_net_cpu *info = priv->cpu[my_cpu]; + + /* Disable the ingress interrupt. */ + disable_percpu_irq(priv->intr_id); + + /* Do nothing else if not registered. */ + if (info == NULL || !info->registered) + return; + + { + struct tile_netio_queue *queue = &info->queue; + netio_queue_user_impl_t *qup = &queue->__user_part; + + /* Discard all our credits. */ + __netio_fastio_return_credits(qup->__fastio_index, -1); + } +} + + +/* + * Unregister with hypervisor on the current CPU. + * + * Also, disable the ingress interrupt. */ static void tile_net_unregister(void *dev_ptr) { @@ -1162,35 +1195,23 @@ static void tile_net_unregister(void *dev_ptr) int my_cpu = smp_processor_id(); struct tile_net_cpu *info = priv->cpu[my_cpu]; - int ret = 0; + int ret; int dummy = 0; - /* Do nothing if never registered. */ - if (info == NULL) - return; + /* Disable the ingress interrupt. */ + disable_percpu_irq(priv->intr_id); - /* Do nothing if already unregistered. */ - if (!info->registered) + /* Do nothing else if not registered. */ + if (info == NULL || !info->registered) return; - /* - * Unregister ourselves with LIPP. - */ + /* Unregister ourselves with LIPP/LEPP. */ ret = hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy, sizeof(dummy), NETIO_IPP_INPUT_UNREGISTER_OFF); - PDEBUG("hv_dev_pwrite(NETIO_IPP_INPUT_UNREGISTER_OFF) returned %d\n", - ret); - if (ret < 0) { - /* FIXME: Just panic? */ - pr_err("hv_dev_pwrite NETIO_IPP_INPUT_UNREGISTER_OFF" - " failure %d\n", ret); - } + if (ret < 0) + panic("Failed to unregister with LIPP/LEPP!\n"); - /* - * Discard all packets still in our NetIO queue. Hopefully, - * once the unregister call is complete, there will be no - * packets still in flight on the IDN. - */ + /* Discard all packets still in our NetIO queue. */ tile_net_discard_packets(dev); /* Reset state. */ @@ -1200,11 +1221,6 @@ static void tile_net_unregister(void *dev_ptr) /* Cancel egress timer. */ del_timer(&info->egress_timer); info->egress_timer_scheduled = false; - - netif_napi_del(&info->napi); - - /* Now we are unregistered. */ - info->registered = false; } @@ -1212,18 +1228,28 @@ static void tile_net_unregister(void *dev_ptr) * Helper function for "tile_net_stop()". * * Also used to handle registration failure in "tile_net_open_inner()", - * when "fully_opened" is known to be false, and the various extra - * steps in "tile_net_stop()" are not necessary. ISSUE: It might be - * simpler if we could just call "tile_net_stop()" anyway. + * when the various extra steps in "tile_net_stop()" are not necessary. */ static void tile_net_stop_aux(struct net_device *dev) { struct tile_net_priv *priv = netdev_priv(dev); + int i; int dummy = 0; - /* Unregister all tiles, so LIPP will stop delivering packets. */ + /* + * Unregister all tiles, so LIPP will stop delivering packets. + * Also, delete all the "napi" objects (sequentially, to protect + * "dev->napi_list"). + */ on_each_cpu(tile_net_unregister, (void *)dev, 1); + for_each_online_cpu(i) { + struct tile_net_cpu *info = priv->cpu[i]; + if (info != NULL && info->registered) { + netif_napi_del(&info->napi); + info->registered = false; + } + } /* Stop LIPP/LEPP. 
*/ if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy, @@ -1235,18 +1261,15 @@ static void tile_net_stop_aux(struct net_device *dev) /* - * Disable ingress interrupts for the given device on the current cpu. + * Disable NAPI for the given device on the current cpu. */ -static void tile_net_disable_intr(void *dev_ptr) +static void tile_net_stop_disable(void *dev_ptr) { struct net_device *dev = (struct net_device *)dev_ptr; struct tile_net_priv *priv = netdev_priv(dev); int my_cpu = smp_processor_id(); struct tile_net_cpu *info = priv->cpu[my_cpu]; - /* Disable hypervisor interrupt. */ - disable_percpu_irq(priv->intr_id); - /* Disable NAPI if needed. */ if (info != NULL && info->napi_enabled) { napi_disable(&info->napi); @@ -1256,21 +1279,24 @@ static void tile_net_disable_intr(void *dev_ptr) /* - * Enable ingress interrupts for the given device on the current cpu. + * Enable NAPI and the ingress interrupt for the given device + * on the current cpu. + * + * ISSUE: Only do this for "network cpus"? */ -static void tile_net_enable_intr(void *dev_ptr) +static void tile_net_open_enable(void *dev_ptr) { struct net_device *dev = (struct net_device *)dev_ptr; struct tile_net_priv *priv = netdev_priv(dev); int my_cpu = smp_processor_id(); struct tile_net_cpu *info = priv->cpu[my_cpu]; - /* Enable hypervisor interrupt. */ - enable_percpu_irq(priv->intr_id); - /* Enable NAPI. */ napi_enable(&info->napi); info->napi_enabled = true; + + /* Enable the ingress interrupt. */ + enable_percpu_irq(priv->intr_id); } @@ -1288,8 +1314,9 @@ static int tile_net_open_inner(struct net_device *dev) int my_cpu = smp_processor_id(); struct tile_net_cpu *info; struct tile_netio_queue *queue; - unsigned int irq; + int result = 0; int i; + int dummy = 0; /* * First try to register just on the local CPU, and handle any @@ -1307,42 +1334,52 @@ static int tile_net_open_inner(struct net_device *dev) /* * Now register everywhere else. If any registration fails, * even for "link down" (which might not be possible), we - * clean up using "tile_net_stop_aux()". + * clean up using "tile_net_stop_aux()". Also, add all the + * "napi" objects (sequentially, to protect "dev->napi_list"). + * ISSUE: Only use "netif_napi_add()" for "network cpus"? */ smp_call_function(tile_net_register, (void *)dev, 1); for_each_online_cpu(i) { - if (!priv->cpu[i]->registered) { - tile_net_stop_aux(dev); - return -EAGAIN; - } + struct tile_net_cpu *info = priv->cpu[i]; + if (info->registered) + netif_napi_add(dev, &info->napi, tile_net_poll, 64); + else + result = -EAGAIN; + } + if (result != 0) { + tile_net_stop_aux(dev); + return result; } queue = &info->queue; - /* - * Set the device intr bit mask. - * The tile_net_register above sets per tile __intr_id. - */ - priv->intr_id = queue->__system_part->__intr_id; - BUG_ON(!priv->intr_id); - - /* - * Register the device interrupt handler. - * The __ffs() function returns the index into the interrupt handler - * table from the interrupt bit mask which should have one bit - * and one bit only set. - */ - irq = __ffs(priv->intr_id); - tile_irq_activate(irq, TILE_IRQ_PERCPU); - BUG_ON(request_irq(irq, tile_net_handle_ingress_interrupt, - 0, dev->name, (void *)dev) != 0); + if (priv->intr_id == 0) { + unsigned int irq; - /* ISSUE: How could "priv->fully_opened" ever be "true" here? */ - - if (!priv->fully_opened) { + /* + * Acquire the irq allocated by the hypervisor. Every + * queue gets the same irq. The "__intr_id" field is + * "1 << irq", so we use "__ffs()" to extract "irq". 
+ */ + priv->intr_id = queue->__system_part->__intr_id; + BUG_ON(priv->intr_id == 0); + irq = __ffs(priv->intr_id); - int dummy = 0; + /* + * Register the ingress interrupt handler for this + * device, permanently. + * + * We used to call "free_irq()" in "tile_net_stop()", + * and then re-register the handler here every time, + * but that caused DNP errors in "handle_IRQ_event()" + * because "desc->action" was NULL. See bug 9143. + */ + tile_irq_activate(irq, TILE_IRQ_PERCPU); + BUG_ON(request_irq(irq, tile_net_handle_ingress_interrupt, + 0, dev->name, (void *)dev) != 0); + } + { /* Allocate initial buffers. */ int max_buffers = @@ -1359,18 +1396,21 @@ static int tile_net_open_inner(struct net_device *dev) if (info->num_needed_small_buffers != 0 || info->num_needed_large_buffers != 0) panic("Insufficient memory for buffer stack!"); + } - /* Start LIPP/LEPP and activate "ingress" at the shim. */ - if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy, - sizeof(dummy), NETIO_IPP_INPUT_INIT_OFF) < 0) - panic("Failed to activate the LIPP Shim!\n"); + /* We are about to be active. */ + priv->active = true; - priv->fully_opened = 1; - } + /* Make sure "active" is visible to all tiles. */ + mb(); - /* On each tile, enable the hypervisor to trigger interrupts. */ - /* ISSUE: Do this before starting LIPP/LEPP? */ - on_each_cpu(tile_net_enable_intr, (void *)dev, 1); + /* On each tile, enable NAPI and the ingress interrupt. */ + on_each_cpu(tile_net_open_enable, (void *)dev, 1); + + /* Start LIPP/LEPP and activate "ingress" at the shim. */ + if (hv_dev_pwrite(priv->hv_devhdl, 0, (HV_VirtAddr)&dummy, + sizeof(dummy), NETIO_IPP_INPUT_INIT_OFF) < 0) + panic("Failed to activate the LIPP Shim!\n"); /* Start our transmit queue. */ netif_start_queue(dev); @@ -1396,9 +1436,9 @@ static void tile_net_open_retry(struct work_struct *w) * ourselves to try again later; otherwise, tell Linux we now have * a working link. ISSUE: What if the return value is negative? */ - if (tile_net_open_inner(priv->dev)) - schedule_delayed_work_on(singlethread_cpu, &priv->retry_work, - TILE_NET_RETRY_INTERVAL); + if (tile_net_open_inner(priv->dev) != 0) + schedule_delayed_work(&priv->retry_work, + TILE_NET_RETRY_INTERVAL); else netif_carrier_on(priv->dev); } @@ -1412,8 +1452,8 @@ static void tile_net_open_retry(struct work_struct *w) * The open entry point is called when a network interface is made * active by the system (IFF_UP). At this point all resources needed * for transmit and receive operations are allocated, the interrupt - * handler is registered with the OS, the watchdog timer is started, - * and the stack is notified that the interface is ready. + * handler is registered with the OS (if needed), the watchdog timer + * is started, and the stack is notified that the interface is ready. * * If the actual link is not available yet, then we tell Linux that * we have no carrier, and we keep checking until the link comes up. @@ -1468,6 +1508,10 @@ static int tile_net_open(struct net_device *dev) #endif priv->partly_opened = 1; + + } else { + /* FIXME: Is this possible? */ + /* printk("Already partly opened.\n"); */ } /* @@ -1487,57 +1531,17 @@ static int tile_net_open(struct net_device *dev) * and then remember to try again later. */ netif_carrier_off(dev); - schedule_delayed_work_on(singlethread_cpu, &priv->retry_work, - TILE_NET_RETRY_INTERVAL); + schedule_delayed_work(&priv->retry_work, TILE_NET_RETRY_INTERVAL); return 0; } -/* - * Disables a network interface. - * - * Returns 0, this is not allowed to fail. 
- * - * The close entry point is called when an interface is de-activated - * by the OS. The hardware is still under the drivers control, but - * needs to be disabled. A global MAC reset is issued to stop the - * hardware, and all transmit and receive resources are freed. - * - * ISSUE: Can this can be called while "tile_net_poll()" is running? - */ -static int tile_net_stop(struct net_device *dev) +static int tile_net_drain_lipp_buffers(struct tile_net_priv *priv) { - struct tile_net_priv *priv = netdev_priv(dev); - - bool pending = true; - - PDEBUG("tile_net_stop()\n"); - - /* ISSUE: Only needed if not yet fully open. */ - cancel_delayed_work_sync(&priv->retry_work); - - /* Can't transmit any more. */ - netif_stop_queue(dev); - - /* - * Disable hypervisor interrupts on each tile. - */ - on_each_cpu(tile_net_disable_intr, (void *)dev, 1); - - /* - * Unregister the interrupt handler. - * The __ffs() function returns the index into the interrupt handler - * table from the interrupt bit mask which should have one bit - * and one bit only set. - */ - if (priv->intr_id) - free_irq(__ffs(priv->intr_id), dev); - - /* - * Drain all the LIPP buffers. - */ + int n = 0; + /* Drain all the LIPP buffers. */ while (true) { int buffer; @@ -1560,43 +1564,105 @@ static int tile_net_stop(struct net_device *dev) kfree_skb(skb); } + + n++; } - /* Stop LIPP/LEPP. */ - tile_net_stop_aux(dev); + return n; +} - priv->fully_opened = 0; +/* + * Disables a network interface. + * + * Returns 0, this is not allowed to fail. + * + * The close entry point is called when an interface is de-activated + * by the OS. The hardware is still under the drivers control, but + * needs to be disabled. A global MAC reset is issued to stop the + * hardware, and all transmit and receive resources are freed. + * + * ISSUE: How closely does "netif_running(dev)" mirror "priv->active"? + * + * Before we are called by "__dev_close()", "netif_running()" will + * have been cleared, so no NEW calls to "tile_net_poll()" will be + * made by "netpoll_poll_dev()". + * + * Often, this can cause some tiles to still have packets in their + * queues, so we must call "tile_net_discard_packets()" later. + * + * Note that some other tile may still be INSIDE "tile_net_poll()", + * and in fact, many will be, if there is heavy network load. + * + * Calling "on_each_cpu(tile_net_stop_disable, (void *)dev, 1)" when + * any tile is still "napi_schedule()"'d will induce a horrible crash + * when "msleep()" is called. This includes tiles which are inside + * "tile_net_poll()" which have not yet called "napi_complete()". + * + * So, we must first try to wait long enough for other tiles to finish + * with any current "tile_net_poll()" call, and, hopefully, to clear + * the "scheduled" flag. ISSUE: It is unclear what happens to tiles + * which have called "napi_schedule()" but which had not yet tried to + * call "tile_net_poll()", or which exhausted their budget inside + * "tile_net_poll()" just before this function was called. + */ +static int tile_net_stop(struct net_device *dev) +{ + struct tile_net_priv *priv = netdev_priv(dev); + + PDEBUG("tile_net_stop()\n"); + /* Start discarding packets. */ + priv->active = false; + + /* Make sure "active" is visible to all tiles. */ + mb(); /* - * XXX: ISSUE: It appears that, in practice anyway, by the - * time we get here, there are no pending completions. + * On each tile, make sure no NEW packets get delivered, and + * disable the ingress interrupt. 
+ * + * Note that the ingress interrupt can fire AFTER this, + * presumably due to packets which were recently delivered, + * but it will have no effect. */ - while (pending) { + on_each_cpu(tile_net_deregister, (void *)dev, 1); - struct sk_buff *olds[32]; - unsigned int wanted = 32; - unsigned int i, nolds = 0; + /* Optimistically drain LIPP buffers. */ + (void)tile_net_drain_lipp_buffers(priv); - nolds = tile_net_lepp_grab_comps(dev, olds, - wanted, &pending); + /* ISSUE: Only needed if not yet fully open. */ + cancel_delayed_work_sync(&priv->retry_work); - /* ISSUE: We have never actually seen this debug spew. */ - if (nolds != 0) - pr_info("During tile_net_stop(), grabbed %d comps.\n", - nolds); + /* Can't transmit any more. */ + netif_stop_queue(dev); - for (i = 0; i < nolds; i++) - kfree_skb(olds[i]); - } + /* Disable NAPI on each tile. */ + on_each_cpu(tile_net_stop_disable, (void *)dev, 1); + + /* + * Drain any remaining LIPP buffers. NOTE: This "printk()" + * has never been observed, but in theory it could happen. + */ + if (tile_net_drain_lipp_buffers(priv) != 0) + printk("Had to drain some extra LIPP buffers!\n"); + /* Stop LIPP/LEPP. */ + tile_net_stop_aux(dev); + + /* + * ISSUE: It appears that, in practice anyway, by the time we + * get here, there are no pending completions, but just in case, + * we free (all of) them anyway. + */ + while (tile_net_lepp_free_comps(dev, true)) + /* loop */; /* Wipe the EPP queue. */ - memset(priv->epp_queue, 0, sizeof(lepp_queue_t)); + memset(priv->eq, 0, sizeof(lepp_queue_t)); /* Evict the EPP queue. */ - finv_buffer(priv->epp_queue, PAGE_SIZE); + finv_buffer(priv->eq, EQ_SIZE); return 0; } @@ -1620,7 +1686,7 @@ static unsigned int tile_net_tx_frags(lepp_frag_t *frags, if (b_len != 0) { if (!hash_default) - finv_buffer_remote(b_data, b_len); + finv_buffer_remote(b_data, b_len, 0); cpa = __pa(b_data); frags[n].cpa_lo = cpa; @@ -1643,7 +1709,7 @@ static unsigned int tile_net_tx_frags(lepp_frag_t *frags, if (!hash_default) { void *va = pfn_to_kaddr(pfn) + f->page_offset; BUG_ON(PageHighMem(f->page)); - finv_buffer_remote(va, f->size); + finv_buffer_remote(va, f->size, 0); } cpa = ((phys_addr_t)pfn << PAGE_SHIFT) + f->page_offset; @@ -1742,17 +1808,15 @@ static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev) unsigned long irqflags; - lepp_queue_t *eq = priv->epp_queue; + lepp_queue_t *eq = priv->eq; - struct sk_buff *olds[4]; - unsigned int wanted = 4; + struct sk_buff *olds[8]; + unsigned int wanted = 8; unsigned int i, nolds = 0; unsigned int cmd_head, cmd_tail, cmd_next; unsigned int comp_tail; - unsigned int free_slots; - /* Paranoia. */ BUG_ON(skb->protocol != htons(ETH_P_IP)); @@ -1780,34 +1844,32 @@ static int tile_net_tx_tso(struct sk_buff *skb, struct net_device *dev) /* Enqueue the command. */ - spin_lock_irqsave(&priv->cmd_lock, irqflags); + spin_lock_irqsave(&priv->eq_lock, irqflags); /* * Handle completions if needed to make room. * HACK: Spin until there is sufficient room. */ - free_slots = lepp_num_free_comp_slots(eq); - if (free_slots < 1) { -spin: - nolds += tile_net_lepp_grab_comps(dev, olds + nolds, - wanted - nolds, NULL); - if (lepp_num_free_comp_slots(eq) < 1) - goto spin; + if (lepp_num_free_comp_slots(eq) == 0) { + nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 0); + if (nolds == 0) { +busy: + spin_unlock_irqrestore(&priv->eq_lock, irqflags); + return NETDEV_TX_BUSY; + } } cmd_head = eq->cmd_head; cmd_tail = eq->cmd_tail; - /* NOTE: The "gotos" below are untested. 
*/ - /* Prepare to advance, detecting full queue. */ cmd_next = cmd_tail + cmd_size; if (cmd_tail < cmd_head && cmd_next >= cmd_head) - goto spin; + goto busy; if (cmd_next > LEPP_CMD_LIMIT) { cmd_next = 0; if (cmd_next == cmd_head) - goto spin; + goto busy; } /* Copy the command. */ @@ -1823,14 +1885,18 @@ spin: eq->comp_tail = comp_tail; /* Flush before allowing LEPP to handle the command. */ + /* ISSUE: Is this the optimal location for the flush? */ __insn_mf(); eq->cmd_tail = cmd_tail; - spin_unlock_irqrestore(&priv->cmd_lock, irqflags); - + /* NOTE: Using "4" here is more efficient than "0" or "2", */ + /* and, strangely, more efficient than pre-checking the number */ + /* of available completions, and comparing it to 4. */ if (nolds == 0) - nolds = tile_net_lepp_grab_comps(dev, olds, wanted, NULL); + nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 4); + + spin_unlock_irqrestore(&priv->eq_lock, irqflags); /* Handle completions. */ for (i = 0; i < nolds; i++) @@ -1870,10 +1936,10 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) unsigned int num_frags; - lepp_queue_t *eq = priv->epp_queue; + lepp_queue_t *eq = priv->eq; - struct sk_buff *olds[4]; - unsigned int wanted = 4; + struct sk_buff *olds[8]; + unsigned int wanted = 8; unsigned int i, nolds = 0; unsigned int cmd_size = sizeof(lepp_cmd_t); @@ -1883,8 +1949,6 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) lepp_cmd_t cmds[LEPP_MAX_FRAGS]; - unsigned int free_slots; - /* * This is paranoia, since we think that if the link doesn't come @@ -1905,7 +1969,8 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) if (hash_default) { HV_PTE pte = *virt_to_pte(current->mm, (unsigned long)data); if (hv_pte_get_mode(pte) != HV_PTE_MODE_CACHE_HASH_L3) - panic("Non-coherent egress buffer!"); + panic("Non-HFH egress buffer! VA=%p Mode=%d PTE=%llx", + data, hv_pte_get_mode(pte), hv_pte_val(pte)); } #endif #endif @@ -1958,37 +2023,35 @@ static int tile_net_tx(struct sk_buff *skb, struct net_device *dev) /* Enqueue the commands. */ - spin_lock_irqsave(&priv->cmd_lock, irqflags); + spin_lock_irqsave(&priv->eq_lock, irqflags); /* * Handle completions if needed to make room. * HACK: Spin until there is sufficient room. */ - free_slots = lepp_num_free_comp_slots(eq); - if (free_slots < 1) { -spin: - nolds += tile_net_lepp_grab_comps(dev, olds + nolds, - wanted - nolds, NULL); - if (lepp_num_free_comp_slots(eq) < 1) - goto spin; + if (lepp_num_free_comp_slots(eq) == 0) { + nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 0); + if (nolds == 0) { +busy: + spin_unlock_irqrestore(&priv->eq_lock, irqflags); + return NETDEV_TX_BUSY; + } } cmd_head = eq->cmd_head; cmd_tail = eq->cmd_tail; - /* NOTE: The "gotos" below are untested. */ - /* Copy the commands, or fail. */ for (i = 0; i < num_frags; i++) { /* Prepare to advance, detecting full queue. */ cmd_next = cmd_tail + cmd_size; if (cmd_tail < cmd_head && cmd_next >= cmd_head) - goto spin; + goto busy; if (cmd_next > LEPP_CMD_LIMIT) { cmd_next = 0; if (cmd_next == cmd_head) - goto spin; + goto busy; } /* Copy the command. */ @@ -2005,14 +2068,18 @@ spin: eq->comp_tail = comp_tail; /* Flush before allowing LEPP to handle the command. */ + /* ISSUE: Is this the optimal location for the flush? 
*/ __insn_mf(); eq->cmd_tail = cmd_tail; - spin_unlock_irqrestore(&priv->cmd_lock, irqflags); - + /* NOTE: Using "4" here is more efficient than "0" or "2", */ + /* and, strangely, more efficient than pre-checking the number */ + /* of available completions, and comparing it to 4. */ if (nolds == 0) - nolds = tile_net_lepp_grab_comps(dev, olds, wanted, NULL); + nolds = tile_net_lepp_grab_comps(eq, olds, wanted, 4); + + spin_unlock_irqrestore(&priv->eq_lock, irqflags); /* Handle completions. */ for (i = 0; i < nolds; i++) @@ -2261,7 +2328,6 @@ static struct net_device *tile_net_dev_init(const char *name) int ret; struct net_device *dev; struct tile_net_priv *priv; - struct page *page; /* * Allocate the device structure. This allocates "priv", calls @@ -2285,23 +2351,21 @@ static struct net_device *tile_net_dev_init(const char *name) INIT_DELAYED_WORK(&priv->retry_work, tile_net_open_retry); - spin_lock_init(&priv->cmd_lock); - spin_lock_init(&priv->comp_lock); + spin_lock_init(&priv->eq_lock); - /* Allocate "epp_queue". */ - BUG_ON(get_order(sizeof(lepp_queue_t)) != 0); - page = alloc_pages(GFP_KERNEL | __GFP_ZERO, 0); - if (!page) { + /* Allocate "eq". */ + priv->eq_pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, EQ_ORDER); + if (!priv->eq_pages) { free_netdev(dev); return NULL; } - priv->epp_queue = page_address(page); + priv->eq = page_address(priv->eq_pages); /* Register the network device. */ ret = register_netdev(dev); if (ret) { pr_err("register_netdev %s failed %d\n", dev->name, ret); - free_page((unsigned long)priv->epp_queue); + __free_pages(priv->eq_pages, EQ_ORDER); free_netdev(dev); return NULL; } @@ -2310,7 +2374,7 @@ static struct net_device *tile_net_dev_init(const char *name) ret = tile_net_get_mac(dev); if (ret < 0) { unregister_netdev(dev); - free_page((unsigned long)priv->epp_queue); + __free_pages(priv->eq_pages, EQ_ORDER); free_netdev(dev); return NULL; } @@ -2321,6 +2385,9 @@ static struct net_device *tile_net_dev_init(const char *name) /* * Module cleanup. + * + * FIXME: If compiled as a module, this module cannot be "unloaded", + * because the "ingress interrupt handler" is registered permanently. */ static void tile_net_cleanup(void) { @@ -2331,8 +2398,8 @@ static void tile_net_cleanup(void) struct net_device *dev = tile_net_devs[i]; struct tile_net_priv *priv = netdev_priv(dev); unregister_netdev(dev); - finv_buffer(priv->epp_queue, PAGE_SIZE); - free_page((unsigned long)priv->epp_queue); + finv_buffer(priv->eq, EQ_SIZE); + __free_pages(priv->eq_pages, EQ_ORDER); free_netdev(dev); } } @@ -2355,7 +2422,12 @@ static int tile_net_init_module(void) } +module_init(tile_net_init_module); +module_exit(tile_net_cleanup); + + #ifndef MODULE + /* * The "network_cpus" boot argument specifies the cpus that are dedicated * to handle ingress packets. @@ -2391,8 +2463,5 @@ static int __init network_cpus_setup(char *str) return 0; } __setup("network_cpus=", network_cpus_setup); -#endif - -module_init(tile_net_init_module); -module_exit(tile_net_cleanup); +#endif |