From 851b164231d1117673aa44c00c7622e48b7dfcf4 Mon Sep 17 00:00:00 2001 From: Alok Kataria Date: Tue, 13 Oct 2009 14:51:05 -0700 Subject: [SCSI] vmw_pvscsi: SCSI driver for VMware's virtual HBA. This is a driver for VMware's paravirtualized SCSI device, which should improve disk performance for guests running under control of VMware hypervisors that support such devices. Signed-off-by: Alok N Kataria Signed-off-by: James Bottomley --- drivers/scsi/vmw_pvscsi.c | 1407 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 1407 insertions(+) create mode 100644 drivers/scsi/vmw_pvscsi.c (limited to 'drivers/scsi/vmw_pvscsi.c') diff --git a/drivers/scsi/vmw_pvscsi.c b/drivers/scsi/vmw_pvscsi.c new file mode 100644 index 000000000000..d2604c813a20 --- /dev/null +++ b/drivers/scsi/vmw_pvscsi.c @@ -0,0 +1,1407 @@ +/* + * Linux driver for VMware's para-virtualized SCSI HBA. + * + * Copyright (C) 2008-2009, VMware, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms of the GNU General Public License as published by the + * Free Software Foundation; version 2 of the License and no later version. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY OR FITNESS FOR A PARTICULAR PURPOSE, GOOD TITLE or + * NON INFRINGEMENT. See the GNU General Public License for more + * details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA. + * + * Maintained by: Alok N Kataria + * + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include "vmw_pvscsi.h" + +#define PVSCSI_LINUX_DRIVER_DESC "VMware PVSCSI driver" + +MODULE_DESCRIPTION(PVSCSI_LINUX_DRIVER_DESC); +MODULE_AUTHOR("VMware, Inc."); +MODULE_LICENSE("GPL"); +MODULE_VERSION(PVSCSI_DRIVER_VERSION_STRING); + +#define PVSCSI_DEFAULT_NUM_PAGES_PER_RING 8 +#define PVSCSI_DEFAULT_NUM_PAGES_MSG_RING 1 +#define PVSCSI_DEFAULT_QUEUE_DEPTH 64 +#define SGL_SIZE PAGE_SIZE + +struct pvscsi_sg_list { + struct PVSCSISGElement sge[PVSCSI_MAX_NUM_SG_ENTRIES_PER_SEGMENT]; +}; + +struct pvscsi_ctx { + /* + * The index of the context in cmd_map serves as the context ID for a + * 1-to-1 mapping completions back to requests. + */ + struct scsi_cmnd *cmd; + struct pvscsi_sg_list *sgl; + struct list_head list; + dma_addr_t dataPA; + dma_addr_t sensePA; + dma_addr_t sglPA; +}; + +struct pvscsi_adapter { + char *mmioBase; + unsigned int irq; + u8 rev; + bool use_msi; + bool use_msix; + bool use_msg; + + spinlock_t hw_lock; + + struct workqueue_struct *workqueue; + struct work_struct work; + + struct PVSCSIRingReqDesc *req_ring; + unsigned req_pages; + unsigned req_depth; + dma_addr_t reqRingPA; + + struct PVSCSIRingCmpDesc *cmp_ring; + unsigned cmp_pages; + dma_addr_t cmpRingPA; + + struct PVSCSIRingMsgDesc *msg_ring; + unsigned msg_pages; + dma_addr_t msgRingPA; + + struct PVSCSIRingsState *rings_state; + dma_addr_t ringStatePA; + + struct pci_dev *dev; + struct Scsi_Host *host; + + struct list_head cmd_pool; + struct pvscsi_ctx *cmd_map; +}; + + +/* Command line parameters */ +static int pvscsi_ring_pages = PVSCSI_DEFAULT_NUM_PAGES_PER_RING; +static int pvscsi_msg_ring_pages = PVSCSI_DEFAULT_NUM_PAGES_MSG_RING; +static int pvscsi_cmd_per_lun = PVSCSI_DEFAULT_QUEUE_DEPTH; +static bool pvscsi_disable_msi; +static bool pvscsi_disable_msix; +static bool pvscsi_use_msg = true; + +#define PVSCSI_RW (S_IRUSR | S_IWUSR) + +module_param_named(ring_pages, pvscsi_ring_pages, int, PVSCSI_RW); +MODULE_PARM_DESC(ring_pages, "Number of pages per req/cmp ring - (default=" + __stringify(PVSCSI_DEFAULT_NUM_PAGES_PER_RING) ")"); + +module_param_named(msg_ring_pages, pvscsi_msg_ring_pages, int, PVSCSI_RW); +MODULE_PARM_DESC(msg_ring_pages, "Number of pages for the msg ring - (default=" + __stringify(PVSCSI_DEFAULT_NUM_PAGES_MSG_RING) ")"); + +module_param_named(cmd_per_lun, pvscsi_cmd_per_lun, int, PVSCSI_RW); +MODULE_PARM_DESC(cmd_per_lun, "Maximum commands per lun - (default=" + __stringify(PVSCSI_MAX_REQ_QUEUE_DEPTH) ")"); + +module_param_named(disable_msi, pvscsi_disable_msi, bool, PVSCSI_RW); +MODULE_PARM_DESC(disable_msi, "Disable MSI use in driver - (default=0)"); + +module_param_named(disable_msix, pvscsi_disable_msix, bool, PVSCSI_RW); +MODULE_PARM_DESC(disable_msix, "Disable MSI-X use in driver - (default=0)"); + +module_param_named(use_msg, pvscsi_use_msg, bool, PVSCSI_RW); +MODULE_PARM_DESC(use_msg, "Use msg ring when available - (default=1)"); + +static const struct pci_device_id pvscsi_pci_tbl[] = { + { PCI_VDEVICE(VMWARE, PCI_DEVICE_ID_VMWARE_PVSCSI) }, + { 0 } +}; + +MODULE_DEVICE_TABLE(pci, pvscsi_pci_tbl); + +static struct device * +pvscsi_dev(const struct pvscsi_adapter *adapter) +{ + return &(adapter->dev->dev); +} + +static struct pvscsi_ctx * +pvscsi_find_context(const struct pvscsi_adapter *adapter, struct scsi_cmnd *cmd) +{ + struct pvscsi_ctx *ctx, *end; + + end = &adapter->cmd_map[adapter->req_depth]; + for (ctx = adapter->cmd_map; ctx < end; ctx++) + if (ctx->cmd == cmd) + return ctx; + + return NULL; +} + +static struct pvscsi_ctx * +pvscsi_acquire_context(struct pvscsi_adapter *adapter, struct scsi_cmnd *cmd) +{ + struct pvscsi_ctx *ctx; + + if (list_empty(&adapter->cmd_pool)) + return NULL; + + ctx = list_first_entry(&adapter->cmd_pool, struct pvscsi_ctx, list); + ctx->cmd = cmd; + list_del(&ctx->list); + + return ctx; +} + +static void pvscsi_release_context(struct pvscsi_adapter *adapter, + struct pvscsi_ctx *ctx) +{ + ctx->cmd = NULL; + list_add(&ctx->list, &adapter->cmd_pool); +} + +/* + * Map a pvscsi_ctx struct to a context ID field value; we map to a simple + * non-zero integer. ctx always points to an entry in cmd_map array, hence + * the return value is always >=1. + */ +static u64 pvscsi_map_context(const struct pvscsi_adapter *adapter, + const struct pvscsi_ctx *ctx) +{ + return ctx - adapter->cmd_map + 1; +} + +static struct pvscsi_ctx * +pvscsi_get_context(const struct pvscsi_adapter *adapter, u64 context) +{ + return &adapter->cmd_map[context - 1]; +} + +static void pvscsi_reg_write(const struct pvscsi_adapter *adapter, + u32 offset, u32 val) +{ + writel(val, adapter->mmioBase + offset); +} + +static u32 pvscsi_reg_read(const struct pvscsi_adapter *adapter, u32 offset) +{ + return readl(adapter->mmioBase + offset); +} + +static u32 pvscsi_read_intr_status(const struct pvscsi_adapter *adapter) +{ + return pvscsi_reg_read(adapter, PVSCSI_REG_OFFSET_INTR_STATUS); +} + +static void pvscsi_write_intr_status(const struct pvscsi_adapter *adapter, + u32 val) +{ + pvscsi_reg_write(adapter, PVSCSI_REG_OFFSET_INTR_STATUS, val); +} + +static void pvscsi_unmask_intr(const struct pvscsi_adapter *adapter) +{ + u32 intr_bits; + + intr_bits = PVSCSI_INTR_CMPL_MASK; + if (adapter->use_msg) + intr_bits |= PVSCSI_INTR_MSG_MASK; + + pvscsi_reg_write(adapter, PVSCSI_REG_OFFSET_INTR_MASK, intr_bits); +} + +static void pvscsi_mask_intr(const struct pvscsi_adapter *adapter) +{ + pvscsi_reg_write(adapter, PVSCSI_REG_OFFSET_INTR_MASK, 0); +} + +static void pvscsi_write_cmd_desc(const struct pvscsi_adapter *adapter, + u32 cmd, const void *desc, size_t len) +{ + const u32 *ptr = desc; + size_t i; + + len /= sizeof(*ptr); + pvscsi_reg_write(adapter, PVSCSI_REG_OFFSET_COMMAND, cmd); + for (i = 0; i < len; i++) + pvscsi_reg_write(adapter, + PVSCSI_REG_OFFSET_COMMAND_DATA, ptr[i]); +} + +static void pvscsi_abort_cmd(const struct pvscsi_adapter *adapter, + const struct pvscsi_ctx *ctx) +{ + struct PVSCSICmdDescAbortCmd cmd = { 0 }; + + cmd.target = ctx->cmd->device->id; + cmd.context = pvscsi_map_context(adapter, ctx); + + pvscsi_write_cmd_desc(adapter, PVSCSI_CMD_ABORT_CMD, &cmd, sizeof(cmd)); +} + +static void pvscsi_kick_rw_io(const struct pvscsi_adapter *adapter) +{ + pvscsi_reg_write(adapter, PVSCSI_REG_OFFSET_KICK_RW_IO, 0); +} + +static void pvscsi_process_request_ring(const struct pvscsi_adapter *adapter) +{ + pvscsi_reg_write(adapter, PVSCSI_REG_OFFSET_KICK_NON_RW_IO, 0); +} + +static int scsi_is_rw(unsigned char op) +{ + return op == READ_6 || op == WRITE_6 || + op == READ_10 || op == WRITE_10 || + op == READ_12 || op == WRITE_12 || + op == READ_16 || op == WRITE_16; +} + +static void pvscsi_kick_io(const struct pvscsi_adapter *adapter, + unsigned char op) +{ + if (scsi_is_rw(op)) + pvscsi_kick_rw_io(adapter); + else + pvscsi_process_request_ring(adapter); +} + +static void ll_adapter_reset(const struct pvscsi_adapter *adapter) +{ + dev_dbg(pvscsi_dev(adapter), "Adapter Reset on %p\n", adapter); + + pvscsi_write_cmd_desc(adapter, PVSCSI_CMD_ADAPTER_RESET, NULL, 0); +} + +static void ll_bus_reset(const struct pvscsi_adapter *adapter) +{ + dev_dbg(pvscsi_dev(adapter), "Reseting bus on %p\n", adapter); + + pvscsi_write_cmd_desc(adapter, PVSCSI_CMD_RESET_BUS, NULL, 0); +} + +static void ll_device_reset(const struct pvscsi_adapter *adapter, u32 target) +{ + struct PVSCSICmdDescResetDevice cmd = { 0 }; + + dev_dbg(pvscsi_dev(adapter), "Reseting device: target=%u\n", target); + + cmd.target = target; + + pvscsi_write_cmd_desc(adapter, PVSCSI_CMD_RESET_DEVICE, + &cmd, sizeof(cmd)); +} + +static void pvscsi_create_sg(struct pvscsi_ctx *ctx, + struct scatterlist *sg, unsigned count) +{ + unsigned i; + struct PVSCSISGElement *sge; + + BUG_ON(count > PVSCSI_MAX_NUM_SG_ENTRIES_PER_SEGMENT); + + sge = &ctx->sgl->sge[0]; + for (i = 0; i < count; i++, sg++) { + sge[i].addr = sg_dma_address(sg); + sge[i].length = sg_dma_len(sg); + sge[i].flags = 0; + } +} + +/* + * Map all data buffers for a command into PCI space and + * setup the scatter/gather list if needed. + */ +static void pvscsi_map_buffers(struct pvscsi_adapter *adapter, + struct pvscsi_ctx *ctx, struct scsi_cmnd *cmd, + struct PVSCSIRingReqDesc *e) +{ + unsigned count; + unsigned bufflen = scsi_bufflen(cmd); + struct scatterlist *sg; + + e->dataLen = bufflen; + e->dataAddr = 0; + if (bufflen == 0) + return; + + sg = scsi_sglist(cmd); + count = scsi_sg_count(cmd); + if (count != 0) { + int segs = scsi_dma_map(cmd); + if (segs > 1) { + pvscsi_create_sg(ctx, sg, segs); + + e->flags |= PVSCSI_FLAG_CMD_WITH_SG_LIST; + ctx->sglPA = pci_map_single(adapter->dev, ctx->sgl, + SGL_SIZE, PCI_DMA_TODEVICE); + e->dataAddr = ctx->sglPA; + } else + e->dataAddr = sg_dma_address(sg); + } else { + /* + * In case there is no S/G list, scsi_sglist points + * directly to the buffer. + */ + ctx->dataPA = pci_map_single(adapter->dev, sg, bufflen, + cmd->sc_data_direction); + e->dataAddr = ctx->dataPA; + } +} + +static void pvscsi_unmap_buffers(const struct pvscsi_adapter *adapter, + struct pvscsi_ctx *ctx) +{ + struct scsi_cmnd *cmd; + unsigned bufflen; + + cmd = ctx->cmd; + bufflen = scsi_bufflen(cmd); + + if (bufflen != 0) { + unsigned count = scsi_sg_count(cmd); + + if (count != 0) { + scsi_dma_unmap(cmd); + if (ctx->sglPA) { + pci_unmap_single(adapter->dev, ctx->sglPA, + SGL_SIZE, PCI_DMA_TODEVICE); + ctx->sglPA = 0; + } + } else + pci_unmap_single(adapter->dev, ctx->dataPA, bufflen, + cmd->sc_data_direction); + } + if (cmd->sense_buffer) + pci_unmap_single(adapter->dev, ctx->sensePA, + SCSI_SENSE_BUFFERSIZE, PCI_DMA_FROMDEVICE); +} + +static int __devinit pvscsi_allocate_rings(struct pvscsi_adapter *adapter) +{ + adapter->rings_state = pci_alloc_consistent(adapter->dev, PAGE_SIZE, + &adapter->ringStatePA); + if (!adapter->rings_state) + return -ENOMEM; + + adapter->req_pages = min(PVSCSI_MAX_NUM_PAGES_REQ_RING, + pvscsi_ring_pages); + adapter->req_depth = adapter->req_pages + * PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE; + adapter->req_ring = pci_alloc_consistent(adapter->dev, + adapter->req_pages * PAGE_SIZE, + &adapter->reqRingPA); + if (!adapter->req_ring) + return -ENOMEM; + + adapter->cmp_pages = min(PVSCSI_MAX_NUM_PAGES_CMP_RING, + pvscsi_ring_pages); + adapter->cmp_ring = pci_alloc_consistent(adapter->dev, + adapter->cmp_pages * PAGE_SIZE, + &adapter->cmpRingPA); + if (!adapter->cmp_ring) + return -ENOMEM; + + BUG_ON(!IS_ALIGNED(adapter->ringStatePA, PAGE_SIZE)); + BUG_ON(!IS_ALIGNED(adapter->reqRingPA, PAGE_SIZE)); + BUG_ON(!IS_ALIGNED(adapter->cmpRingPA, PAGE_SIZE)); + + if (!adapter->use_msg) + return 0; + + adapter->msg_pages = min(PVSCSI_MAX_NUM_PAGES_MSG_RING, + pvscsi_msg_ring_pages); + adapter->msg_ring = pci_alloc_consistent(adapter->dev, + adapter->msg_pages * PAGE_SIZE, + &adapter->msgRingPA); + if (!adapter->msg_ring) + return -ENOMEM; + BUG_ON(!IS_ALIGNED(adapter->msgRingPA, PAGE_SIZE)); + + return 0; +} + +static void pvscsi_setup_all_rings(const struct pvscsi_adapter *adapter) +{ + struct PVSCSICmdDescSetupRings cmd = { 0 }; + dma_addr_t base; + unsigned i; + + cmd.ringsStatePPN = adapter->ringStatePA >> PAGE_SHIFT; + cmd.reqRingNumPages = adapter->req_pages; + cmd.cmpRingNumPages = adapter->cmp_pages; + + base = adapter->reqRingPA; + for (i = 0; i < adapter->req_pages; i++) { + cmd.reqRingPPNs[i] = base >> PAGE_SHIFT; + base += PAGE_SIZE; + } + + base = adapter->cmpRingPA; + for (i = 0; i < adapter->cmp_pages; i++) { + cmd.cmpRingPPNs[i] = base >> PAGE_SHIFT; + base += PAGE_SIZE; + } + + memset(adapter->rings_state, 0, PAGE_SIZE); + memset(adapter->req_ring, 0, adapter->req_pages * PAGE_SIZE); + memset(adapter->cmp_ring, 0, adapter->cmp_pages * PAGE_SIZE); + + pvscsi_write_cmd_desc(adapter, PVSCSI_CMD_SETUP_RINGS, + &cmd, sizeof(cmd)); + + if (adapter->use_msg) { + struct PVSCSICmdDescSetupMsgRing cmd_msg = { 0 }; + + cmd_msg.numPages = adapter->msg_pages; + + base = adapter->msgRingPA; + for (i = 0; i < adapter->msg_pages; i++) { + cmd_msg.ringPPNs[i] = base >> PAGE_SHIFT; + base += PAGE_SIZE; + } + memset(adapter->msg_ring, 0, adapter->msg_pages * PAGE_SIZE); + + pvscsi_write_cmd_desc(adapter, PVSCSI_CMD_SETUP_MSG_RING, + &cmd_msg, sizeof(cmd_msg)); + } +} + +/* + * Pull a completion descriptor off and pass the completion back + * to the SCSI mid layer. + */ +static void pvscsi_complete_request(struct pvscsi_adapter *adapter, + const struct PVSCSIRingCmpDesc *e) +{ + struct pvscsi_ctx *ctx; + struct scsi_cmnd *cmd; + u32 btstat = e->hostStatus; + u32 sdstat = e->scsiStatus; + + ctx = pvscsi_get_context(adapter, e->context); + cmd = ctx->cmd; + pvscsi_unmap_buffers(adapter, ctx); + pvscsi_release_context(adapter, ctx); + cmd->result = 0; + + if (sdstat != SAM_STAT_GOOD && + (btstat == BTSTAT_SUCCESS || + btstat == BTSTAT_LINKED_COMMAND_COMPLETED || + btstat == BTSTAT_LINKED_COMMAND_COMPLETED_WITH_FLAG)) { + cmd->result = (DID_OK << 16) | sdstat; + if (sdstat == SAM_STAT_CHECK_CONDITION && cmd->sense_buffer) + cmd->result |= (DRIVER_SENSE << 24); + } else + switch (btstat) { + case BTSTAT_SUCCESS: + case BTSTAT_LINKED_COMMAND_COMPLETED: + case BTSTAT_LINKED_COMMAND_COMPLETED_WITH_FLAG: + /* If everything went fine, let's move on.. */ + cmd->result = (DID_OK << 16); + break; + + case BTSTAT_DATARUN: + case BTSTAT_DATA_UNDERRUN: + /* Report residual data in underruns */ + scsi_set_resid(cmd, scsi_bufflen(cmd) - e->dataLen); + cmd->result = (DID_ERROR << 16); + break; + + case BTSTAT_SELTIMEO: + /* Our emulation returns this for non-connected devs */ + cmd->result = (DID_BAD_TARGET << 16); + break; + + case BTSTAT_LUNMISMATCH: + case BTSTAT_TAGREJECT: + case BTSTAT_BADMSG: + cmd->result = (DRIVER_INVALID << 24); + /* fall through */ + + case BTSTAT_HAHARDWARE: + case BTSTAT_INVPHASE: + case BTSTAT_HATIMEOUT: + case BTSTAT_NORESPONSE: + case BTSTAT_DISCONNECT: + case BTSTAT_HASOFTWARE: + case BTSTAT_BUSFREE: + case BTSTAT_SENSFAILED: + cmd->result |= (DID_ERROR << 16); + break; + + case BTSTAT_SENTRST: + case BTSTAT_RECVRST: + case BTSTAT_BUSRESET: + cmd->result = (DID_RESET << 16); + break; + + case BTSTAT_ABORTQUEUE: + cmd->result = (DID_ABORT << 16); + break; + + case BTSTAT_SCSIPARITY: + cmd->result = (DID_PARITY << 16); + break; + + default: + cmd->result = (DID_ERROR << 16); + scmd_printk(KERN_DEBUG, cmd, + "Unknown completion status: 0x%x\n", + btstat); + } + + dev_dbg(&cmd->device->sdev_gendev, + "cmd=%p %x ctx=%p result=0x%x status=0x%x,%x\n", + cmd, cmd->cmnd[0], ctx, cmd->result, btstat, sdstat); + + cmd->scsi_done(cmd); +} + +/* + * barrier usage : Since the PVSCSI device is emulated, there could be cases + * where we may want to serialize some accesses between the driver and the + * emulation layer. We use compiler barriers instead of the more expensive + * memory barriers because PVSCSI is only supported on X86 which has strong + * memory access ordering. + */ +static void pvscsi_process_completion_ring(struct pvscsi_adapter *adapter) +{ + struct PVSCSIRingsState *s = adapter->rings_state; + struct PVSCSIRingCmpDesc *ring = adapter->cmp_ring; + u32 cmp_entries = s->cmpNumEntriesLog2; + + while (s->cmpConsIdx != s->cmpProdIdx) { + struct PVSCSIRingCmpDesc *e = ring + (s->cmpConsIdx & + MASK(cmp_entries)); + /* + * This barrier() ensures that *e is not dereferenced while + * the device emulation still writes data into the slot. + * Since the device emulation advances s->cmpProdIdx only after + * updating the slot we want to check it first. + */ + barrier(); + pvscsi_complete_request(adapter, e); + /* + * This barrier() ensures that compiler doesn't reorder write + * to s->cmpConsIdx before the read of (*e) inside + * pvscsi_complete_request. Otherwise, device emulation may + * overwrite *e before we had a chance to read it. + */ + barrier(); + s->cmpConsIdx++; + } +} + +/* + * Translate a Linux SCSI request into a request ring entry. + */ +static int pvscsi_queue_ring(struct pvscsi_adapter *adapter, + struct pvscsi_ctx *ctx, struct scsi_cmnd *cmd) +{ + struct PVSCSIRingsState *s; + struct PVSCSIRingReqDesc *e; + struct scsi_device *sdev; + u32 req_entries; + + s = adapter->rings_state; + sdev = cmd->device; + req_entries = s->reqNumEntriesLog2; + + /* + * If this condition holds, we might have room on the request ring, but + * we might not have room on the completion ring for the response. + * However, we have already ruled out this possibility - we would not + * have successfully allocated a context if it were true, since we only + * have one context per request entry. Check for it anyway, since it + * would be a serious bug. + */ + if (s->reqProdIdx - s->cmpConsIdx >= 1 << req_entries) { + scmd_printk(KERN_ERR, cmd, "vmw_pvscsi: " + "ring full: reqProdIdx=%d cmpConsIdx=%d\n", + s->reqProdIdx, s->cmpConsIdx); + return -1; + } + + e = adapter->req_ring + (s->reqProdIdx & MASK(req_entries)); + + e->bus = sdev->channel; + e->target = sdev->id; + memset(e->lun, 0, sizeof(e->lun)); + e->lun[1] = sdev->lun; + + if (cmd->sense_buffer) { + ctx->sensePA = pci_map_single(adapter->dev, cmd->sense_buffer, + SCSI_SENSE_BUFFERSIZE, + PCI_DMA_FROMDEVICE); + e->senseAddr = ctx->sensePA; + e->senseLen = SCSI_SENSE_BUFFERSIZE; + } else { + e->senseLen = 0; + e->senseAddr = 0; + } + e->cdbLen = cmd->cmd_len; + e->vcpuHint = smp_processor_id(); + memcpy(e->cdb, cmd->cmnd, e->cdbLen); + + e->tag = SIMPLE_QUEUE_TAG; + if (sdev->tagged_supported && + (cmd->tag == HEAD_OF_QUEUE_TAG || + cmd->tag == ORDERED_QUEUE_TAG)) + e->tag = cmd->tag; + + if (cmd->sc_data_direction == DMA_FROM_DEVICE) + e->flags = PVSCSI_FLAG_CMD_DIR_TOHOST; + else if (cmd->sc_data_direction == DMA_TO_DEVICE) + e->flags = PVSCSI_FLAG_CMD_DIR_TODEVICE; + else if (cmd->sc_data_direction == DMA_NONE) + e->flags = PVSCSI_FLAG_CMD_DIR_NONE; + else + e->flags = 0; + + pvscsi_map_buffers(adapter, ctx, cmd, e); + + e->context = pvscsi_map_context(adapter, ctx); + + barrier(); + + s->reqProdIdx++; + + return 0; +} + +static int pvscsi_queue(struct scsi_cmnd *cmd, void (*done)(struct scsi_cmnd *)) +{ + struct Scsi_Host *host = cmd->device->host; + struct pvscsi_adapter *adapter = shost_priv(host); + struct pvscsi_ctx *ctx; + unsigned long flags; + + spin_lock_irqsave(&adapter->hw_lock, flags); + + ctx = pvscsi_acquire_context(adapter, cmd); + if (!ctx || pvscsi_queue_ring(adapter, ctx, cmd) != 0) { + if (ctx) + pvscsi_release_context(adapter, ctx); + spin_unlock_irqrestore(&adapter->hw_lock, flags); + return SCSI_MLQUEUE_HOST_BUSY; + } + + cmd->scsi_done = done; + + dev_dbg(&cmd->device->sdev_gendev, + "queued cmd %p, ctx %p, op=%x\n", cmd, ctx, cmd->cmnd[0]); + + spin_unlock_irqrestore(&adapter->hw_lock, flags); + + pvscsi_kick_io(adapter, cmd->cmnd[0]); + + return 0; +} + +static int pvscsi_abort(struct scsi_cmnd *cmd) +{ + struct pvscsi_adapter *adapter = shost_priv(cmd->device->host); + struct pvscsi_ctx *ctx; + unsigned long flags; + + scmd_printk(KERN_DEBUG, cmd, "task abort on host %u, %p\n", + adapter->host->host_no, cmd); + + spin_lock_irqsave(&adapter->hw_lock, flags); + + /* + * Poll the completion ring first - we might be trying to abort + * a command that is waiting to be dispatched in the completion ring. + */ + pvscsi_process_completion_ring(adapter); + + /* + * If there is no context for the command, it either already succeeded + * or else was never properly issued. Not our problem. + */ + ctx = pvscsi_find_context(adapter, cmd); + if (!ctx) { + scmd_printk(KERN_DEBUG, cmd, "Failed to abort cmd %p\n", cmd); + goto out; + } + + pvscsi_abort_cmd(adapter, ctx); + + pvscsi_process_completion_ring(adapter); + +out: + spin_unlock_irqrestore(&adapter->hw_lock, flags); + return SUCCESS; +} + +/* + * Abort all outstanding requests. This is only safe to use if the completion + * ring will never be walked again or the device has been reset, because it + * destroys the 1-1 mapping between context field passed to emulation and our + * request structure. + */ +static void pvscsi_reset_all(struct pvscsi_adapter *adapter) +{ + unsigned i; + + for (i = 0; i < adapter->req_depth; i++) { + struct pvscsi_ctx *ctx = &adapter->cmd_map[i]; + struct scsi_cmnd *cmd = ctx->cmd; + if (cmd) { + scmd_printk(KERN_ERR, cmd, + "Forced reset on cmd %p\n", cmd); + pvscsi_unmap_buffers(adapter, ctx); + pvscsi_release_context(adapter, ctx); + cmd->result = (DID_RESET << 16); + cmd->scsi_done(cmd); + } + } +} + +static int pvscsi_host_reset(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *host = cmd->device->host; + struct pvscsi_adapter *adapter = shost_priv(host); + unsigned long flags; + bool use_msg; + + scmd_printk(KERN_INFO, cmd, "SCSI Host reset\n"); + + spin_lock_irqsave(&adapter->hw_lock, flags); + + use_msg = adapter->use_msg; + + if (use_msg) { + adapter->use_msg = 0; + spin_unlock_irqrestore(&adapter->hw_lock, flags); + + /* + * Now that we know that the ISR won't add more work on the + * workqueue we can safely flush any outstanding work. + */ + flush_workqueue(adapter->workqueue); + spin_lock_irqsave(&adapter->hw_lock, flags); + } + + /* + * We're going to tear down the entire ring structure and set it back + * up, so stalling new requests until all completions are flushed and + * the rings are back in place. + */ + + pvscsi_process_request_ring(adapter); + + ll_adapter_reset(adapter); + + /* + * Now process any completions. Note we do this AFTER adapter reset, + * which is strange, but stops races where completions get posted + * between processing the ring and issuing the reset. The backend will + * not touch the ring memory after reset, so the immediately pre-reset + * completion ring state is still valid. + */ + pvscsi_process_completion_ring(adapter); + + pvscsi_reset_all(adapter); + adapter->use_msg = use_msg; + pvscsi_setup_all_rings(adapter); + pvscsi_unmask_intr(adapter); + + spin_unlock_irqrestore(&adapter->hw_lock, flags); + + return SUCCESS; +} + +static int pvscsi_bus_reset(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *host = cmd->device->host; + struct pvscsi_adapter *adapter = shost_priv(host); + unsigned long flags; + + scmd_printk(KERN_INFO, cmd, "SCSI Bus reset\n"); + + /* + * We don't want to queue new requests for this bus after + * flushing all pending requests to emulation, since new + * requests could then sneak in during this bus reset phase, + * so take the lock now. + */ + spin_lock_irqsave(&adapter->hw_lock, flags); + + pvscsi_process_request_ring(adapter); + ll_bus_reset(adapter); + pvscsi_process_completion_ring(adapter); + + spin_unlock_irqrestore(&adapter->hw_lock, flags); + + return SUCCESS; +} + +static int pvscsi_device_reset(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *host = cmd->device->host; + struct pvscsi_adapter *adapter = shost_priv(host); + unsigned long flags; + + scmd_printk(KERN_INFO, cmd, "SCSI device reset on scsi%u:%u\n", + host->host_no, cmd->device->id); + + /* + * We don't want to queue new requests for this device after flushing + * all pending requests to emulation, since new requests could then + * sneak in during this device reset phase, so take the lock now. + */ + spin_lock_irqsave(&adapter->hw_lock, flags); + + pvscsi_process_request_ring(adapter); + ll_device_reset(adapter, cmd->device->id); + pvscsi_process_completion_ring(adapter); + + spin_unlock_irqrestore(&adapter->hw_lock, flags); + + return SUCCESS; +} + +static struct scsi_host_template pvscsi_template; + +static const char *pvscsi_info(struct Scsi_Host *host) +{ + struct pvscsi_adapter *adapter = shost_priv(host); + static char buf[256]; + + sprintf(buf, "VMware PVSCSI storage adapter rev %d, req/cmp/msg rings: " + "%u/%u/%u pages, cmd_per_lun=%u", adapter->rev, + adapter->req_pages, adapter->cmp_pages, adapter->msg_pages, + pvscsi_template.cmd_per_lun); + + return buf; +} + +static struct scsi_host_template pvscsi_template = { + .module = THIS_MODULE, + .name = "VMware PVSCSI Host Adapter", + .proc_name = "vmw_pvscsi", + .info = pvscsi_info, + .queuecommand = pvscsi_queue, + .this_id = -1, + .sg_tablesize = PVSCSI_MAX_NUM_SG_ENTRIES_PER_SEGMENT, + .dma_boundary = UINT_MAX, + .max_sectors = 0xffff, + .use_clustering = ENABLE_CLUSTERING, + .eh_abort_handler = pvscsi_abort, + .eh_device_reset_handler = pvscsi_device_reset, + .eh_bus_reset_handler = pvscsi_bus_reset, + .eh_host_reset_handler = pvscsi_host_reset, +}; + +static void pvscsi_process_msg(const struct pvscsi_adapter *adapter, + const struct PVSCSIRingMsgDesc *e) +{ + struct PVSCSIRingsState *s = adapter->rings_state; + struct Scsi_Host *host = adapter->host; + struct scsi_device *sdev; + + printk(KERN_INFO "vmw_pvscsi: msg type: 0x%x - MSG RING: %u/%u (%u) \n", + e->type, s->msgProdIdx, s->msgConsIdx, s->msgNumEntriesLog2); + + BUILD_BUG_ON(PVSCSI_MSG_LAST != 2); + + if (e->type == PVSCSI_MSG_DEV_ADDED) { + struct PVSCSIMsgDescDevStatusChanged *desc; + desc = (struct PVSCSIMsgDescDevStatusChanged *)e; + + printk(KERN_INFO + "vmw_pvscsi: msg: device added at scsi%u:%u:%u\n", + desc->bus, desc->target, desc->lun[1]); + + if (!scsi_host_get(host)) + return; + + sdev = scsi_device_lookup(host, desc->bus, desc->target, + desc->lun[1]); + if (sdev) { + printk(KERN_INFO "vmw_pvscsi: device already exists\n"); + scsi_device_put(sdev); + } else + scsi_add_device(adapter->host, desc->bus, + desc->target, desc->lun[1]); + + scsi_host_put(host); + } else if (e->type == PVSCSI_MSG_DEV_REMOVED) { + struct PVSCSIMsgDescDevStatusChanged *desc; + desc = (struct PVSCSIMsgDescDevStatusChanged *)e; + + printk(KERN_INFO + "vmw_pvscsi: msg: device removed at scsi%u:%u:%u\n", + desc->bus, desc->target, desc->lun[1]); + + if (!scsi_host_get(host)) + return; + + sdev = scsi_device_lookup(host, desc->bus, desc->target, + desc->lun[1]); + if (sdev) { + scsi_remove_device(sdev); + scsi_device_put(sdev); + } else + printk(KERN_INFO + "vmw_pvscsi: failed to lookup scsi%u:%u:%u\n", + desc->bus, desc->target, desc->lun[1]); + + scsi_host_put(host); + } +} + +static int pvscsi_msg_pending(const struct pvscsi_adapter *adapter) +{ + struct PVSCSIRingsState *s = adapter->rings_state; + + return s->msgProdIdx != s->msgConsIdx; +} + +static void pvscsi_process_msg_ring(const struct pvscsi_adapter *adapter) +{ + struct PVSCSIRingsState *s = adapter->rings_state; + struct PVSCSIRingMsgDesc *ring = adapter->msg_ring; + u32 msg_entries = s->msgNumEntriesLog2; + + while (pvscsi_msg_pending(adapter)) { + struct PVSCSIRingMsgDesc *e = ring + (s->msgConsIdx & + MASK(msg_entries)); + + barrier(); + pvscsi_process_msg(adapter, e); + barrier(); + s->msgConsIdx++; + } +} + +static void pvscsi_msg_workqueue_handler(struct work_struct *data) +{ + struct pvscsi_adapter *adapter; + + adapter = container_of(data, struct pvscsi_adapter, work); + + pvscsi_process_msg_ring(adapter); +} + +static int pvscsi_setup_msg_workqueue(struct pvscsi_adapter *adapter) +{ + char name[32]; + + if (!pvscsi_use_msg) + return 0; + + pvscsi_reg_write(adapter, PVSCSI_REG_OFFSET_COMMAND, + PVSCSI_CMD_SETUP_MSG_RING); + + if (pvscsi_reg_read(adapter, PVSCSI_REG_OFFSET_COMMAND_STATUS) == -1) + return 0; + + snprintf(name, sizeof(name), + "vmw_pvscsi_wq_%u", adapter->host->host_no); + + adapter->workqueue = create_singlethread_workqueue(name); + if (!adapter->workqueue) { + printk(KERN_ERR "vmw_pvscsi: failed to create work queue\n"); + return 0; + } + INIT_WORK(&adapter->work, pvscsi_msg_workqueue_handler); + + return 1; +} + +static irqreturn_t pvscsi_isr(int irq, void *devp) +{ + struct pvscsi_adapter *adapter = devp; + int handled; + + if (adapter->use_msi || adapter->use_msix) + handled = true; + else { + u32 val = pvscsi_read_intr_status(adapter); + handled = (val & PVSCSI_INTR_ALL_SUPPORTED) != 0; + if (handled) + pvscsi_write_intr_status(devp, val); + } + + if (handled) { + unsigned long flags; + + spin_lock_irqsave(&adapter->hw_lock, flags); + + pvscsi_process_completion_ring(adapter); + if (adapter->use_msg && pvscsi_msg_pending(adapter)) + queue_work(adapter->workqueue, &adapter->work); + + spin_unlock_irqrestore(&adapter->hw_lock, flags); + } + + return IRQ_RETVAL(handled); +} + +static void pvscsi_free_sgls(const struct pvscsi_adapter *adapter) +{ + struct pvscsi_ctx *ctx = adapter->cmd_map; + unsigned i; + + for (i = 0; i < adapter->req_depth; ++i, ++ctx) + free_pages((unsigned long)ctx->sgl, get_order(SGL_SIZE)); +} + +static int pvscsi_setup_msix(const struct pvscsi_adapter *adapter, int *irq) +{ + struct msix_entry entry = { 0, PVSCSI_VECTOR_COMPLETION }; + int ret; + + ret = pci_enable_msix(adapter->dev, &entry, 1); + if (ret) + return ret; + + *irq = entry.vector; + + return 0; +} + +static void pvscsi_shutdown_intr(struct pvscsi_adapter *adapter) +{ + if (adapter->irq) { + free_irq(adapter->irq, adapter); + adapter->irq = 0; + } + if (adapter->use_msi) { + pci_disable_msi(adapter->dev); + adapter->use_msi = 0; + } else if (adapter->use_msix) { + pci_disable_msix(adapter->dev); + adapter->use_msix = 0; + } +} + +static void pvscsi_release_resources(struct pvscsi_adapter *adapter) +{ + pvscsi_shutdown_intr(adapter); + + if (adapter->workqueue) + destroy_workqueue(adapter->workqueue); + + if (adapter->mmioBase) + pci_iounmap(adapter->dev, adapter->mmioBase); + + pci_release_regions(adapter->dev); + + if (adapter->cmd_map) { + pvscsi_free_sgls(adapter); + kfree(adapter->cmd_map); + } + + if (adapter->rings_state) + pci_free_consistent(adapter->dev, PAGE_SIZE, + adapter->rings_state, adapter->ringStatePA); + + if (adapter->req_ring) + pci_free_consistent(adapter->dev, + adapter->req_pages * PAGE_SIZE, + adapter->req_ring, adapter->reqRingPA); + + if (adapter->cmp_ring) + pci_free_consistent(adapter->dev, + adapter->cmp_pages * PAGE_SIZE, + adapter->cmp_ring, adapter->cmpRingPA); + + if (adapter->msg_ring) + pci_free_consistent(adapter->dev, + adapter->msg_pages * PAGE_SIZE, + adapter->msg_ring, adapter->msgRingPA); +} + +/* + * Allocate scatter gather lists. + * + * These are statically allocated. Trying to be clever was not worth it. + * + * Dynamic allocation can fail, and we can't go deeep into the memory + * allocator, since we're a SCSI driver, and trying too hard to allocate + * memory might generate disk I/O. We also don't want to fail disk I/O + * in that case because we can't get an allocation - the I/O could be + * trying to swap out data to free memory. Since that is pathological, + * just use a statically allocated scatter list. + * + */ +static int __devinit pvscsi_allocate_sg(struct pvscsi_adapter *adapter) +{ + struct pvscsi_ctx *ctx; + int i; + + ctx = adapter->cmd_map; + BUILD_BUG_ON(sizeof(struct pvscsi_sg_list) > SGL_SIZE); + + for (i = 0; i < adapter->req_depth; ++i, ++ctx) { + ctx->sgl = (void *)__get_free_pages(GFP_KERNEL, + get_order(SGL_SIZE)); + ctx->sglPA = 0; + BUG_ON(!IS_ALIGNED(((unsigned long)ctx->sgl), PAGE_SIZE)); + if (!ctx->sgl) { + for (; i >= 0; --i, --ctx) { + free_pages((unsigned long)ctx->sgl, + get_order(SGL_SIZE)); + ctx->sgl = NULL; + } + return -ENOMEM; + } + } + + return 0; +} + +static int __devinit pvscsi_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct pvscsi_adapter *adapter; + struct Scsi_Host *host; + unsigned int i; + unsigned long flags = 0; + int error; + + error = -ENODEV; + + if (pci_enable_device(pdev)) + return error; + + if (pci_set_dma_mask(pdev, DMA_BIT_MASK(64)) == 0 && + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)) == 0) { + printk(KERN_INFO "vmw_pvscsi: using 64bit dma\n"); + } else if (pci_set_dma_mask(pdev, DMA_BIT_MASK(32)) == 0 && + pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)) == 0) { + printk(KERN_INFO "vmw_pvscsi: using 32bit dma\n"); + } else { + printk(KERN_ERR "vmw_pvscsi: failed to set DMA mask\n"); + goto out_disable_device; + } + + pvscsi_template.can_queue = + min(PVSCSI_MAX_NUM_PAGES_REQ_RING, pvscsi_ring_pages) * + PVSCSI_MAX_NUM_REQ_ENTRIES_PER_PAGE; + pvscsi_template.cmd_per_lun = + min(pvscsi_template.can_queue, pvscsi_cmd_per_lun); + host = scsi_host_alloc(&pvscsi_template, sizeof(struct pvscsi_adapter)); + if (!host) { + printk(KERN_ERR "vmw_pvscsi: failed to allocate host\n"); + goto out_disable_device; + } + + adapter = shost_priv(host); + memset(adapter, 0, sizeof(*adapter)); + adapter->dev = pdev; + adapter->host = host; + + spin_lock_init(&adapter->hw_lock); + + host->max_channel = 0; + host->max_id = 16; + host->max_lun = 1; + host->max_cmd_len = 16; + + adapter->rev = pdev->revision; + + if (pci_request_regions(pdev, "vmw_pvscsi")) { + printk(KERN_ERR "vmw_pvscsi: pci memory selection failed\n"); + goto out_free_host; + } + + for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { + if ((pci_resource_flags(pdev, i) & PCI_BASE_ADDRESS_SPACE_IO)) + continue; + + if (pci_resource_len(pdev, i) < PVSCSI_MEM_SPACE_SIZE) + continue; + + break; + } + + if (i == DEVICE_COUNT_RESOURCE) { + printk(KERN_ERR + "vmw_pvscsi: adapter has no suitable MMIO region\n"); + goto out_release_resources; + } + + adapter->mmioBase = pci_iomap(pdev, i, PVSCSI_MEM_SPACE_SIZE); + + if (!adapter->mmioBase) { + printk(KERN_ERR + "vmw_pvscsi: can't iomap for BAR %d memsize %lu\n", + i, PVSCSI_MEM_SPACE_SIZE); + goto out_release_resources; + } + + pci_set_master(pdev); + pci_set_drvdata(pdev, host); + + ll_adapter_reset(adapter); + + adapter->use_msg = pvscsi_setup_msg_workqueue(adapter); + + error = pvscsi_allocate_rings(adapter); + if (error) { + printk(KERN_ERR "vmw_pvscsi: unable to allocate ring memory\n"); + goto out_release_resources; + } + + /* + * From this point on we should reset the adapter if anything goes + * wrong. + */ + pvscsi_setup_all_rings(adapter); + + adapter->cmd_map = kcalloc(adapter->req_depth, + sizeof(struct pvscsi_ctx), GFP_KERNEL); + if (!adapter->cmd_map) { + printk(KERN_ERR "vmw_pvscsi: failed to allocate memory.\n"); + error = -ENOMEM; + goto out_reset_adapter; + } + + INIT_LIST_HEAD(&adapter->cmd_pool); + for (i = 0; i < adapter->req_depth; i++) { + struct pvscsi_ctx *ctx = adapter->cmd_map + i; + list_add(&ctx->list, &adapter->cmd_pool); + } + + error = pvscsi_allocate_sg(adapter); + if (error) { + printk(KERN_ERR "vmw_pvscsi: unable to allocate s/g table\n"); + goto out_reset_adapter; + } + + if (!pvscsi_disable_msix && + pvscsi_setup_msix(adapter, &adapter->irq) == 0) { + printk(KERN_INFO "vmw_pvscsi: using MSI-X\n"); + adapter->use_msix = 1; + } else if (!pvscsi_disable_msi && pci_enable_msi(pdev) == 0) { + printk(KERN_INFO "vmw_pvscsi: using MSI\n"); + adapter->use_msi = 1; + adapter->irq = pdev->irq; + } else { + printk(KERN_INFO "vmw_pvscsi: using INTx\n"); + adapter->irq = pdev->irq; + flags = IRQF_SHARED; + } + + error = request_irq(adapter->irq, pvscsi_isr, flags, + "vmw_pvscsi", adapter); + if (error) { + printk(KERN_ERR + "vmw_pvscsi: unable to request IRQ: %d\n", error); + adapter->irq = 0; + goto out_reset_adapter; + } + + error = scsi_add_host(host, &pdev->dev); + if (error) { + printk(KERN_ERR + "vmw_pvscsi: scsi_add_host failed: %d\n", error); + goto out_reset_adapter; + } + + dev_info(&pdev->dev, "VMware PVSCSI rev %d host #%u\n", + adapter->rev, host->host_no); + + pvscsi_unmask_intr(adapter); + + scsi_scan_host(host); + + return 0; + +out_reset_adapter: + ll_adapter_reset(adapter); +out_release_resources: + pvscsi_release_resources(adapter); +out_free_host: + scsi_host_put(host); +out_disable_device: + pci_set_drvdata(pdev, NULL); + pci_disable_device(pdev); + + return error; +} + +static void __pvscsi_shutdown(struct pvscsi_adapter *adapter) +{ + pvscsi_mask_intr(adapter); + + if (adapter->workqueue) + flush_workqueue(adapter->workqueue); + + pvscsi_shutdown_intr(adapter); + + pvscsi_process_request_ring(adapter); + pvscsi_process_completion_ring(adapter); + ll_adapter_reset(adapter); +} + +static void pvscsi_shutdown(struct pci_dev *dev) +{ + struct Scsi_Host *host = pci_get_drvdata(dev); + struct pvscsi_adapter *adapter = shost_priv(host); + + __pvscsi_shutdown(adapter); +} + +static void pvscsi_remove(struct pci_dev *pdev) +{ + struct Scsi_Host *host = pci_get_drvdata(pdev); + struct pvscsi_adapter *adapter = shost_priv(host); + + scsi_remove_host(host); + + __pvscsi_shutdown(adapter); + pvscsi_release_resources(adapter); + + scsi_host_put(host); + + pci_set_drvdata(pdev, NULL); + pci_disable_device(pdev); +} + +static struct pci_driver pvscsi_pci_driver = { + .name = "vmw_pvscsi", + .id_table = pvscsi_pci_tbl, + .probe = pvscsi_probe, + .remove = __devexit_p(pvscsi_remove), + .shutdown = pvscsi_shutdown, +}; + +static int __init pvscsi_init(void) +{ + pr_info("%s - version %s\n", + PVSCSI_LINUX_DRIVER_DESC, PVSCSI_DRIVER_VERSION_STRING); + return pci_register_driver(&pvscsi_pci_driver); +} + +static void __exit pvscsi_exit(void) +{ + pci_unregister_driver(&pvscsi_pci_driver); +} + +module_init(pvscsi_init); +module_exit(pvscsi_exit); -- cgit v1.2.3