diff options
Diffstat (limited to 'drivers/block/nvme-core.c')
-rw-r--r-- | drivers/block/nvme-core.c | 812 |
1 files changed, 436 insertions, 376 deletions
diff --git a/drivers/block/nvme-core.c b/drivers/block/nvme-core.c index 683dff272562..e5112714188f 100644 --- a/drivers/block/nvme-core.c +++ b/drivers/block/nvme-core.c @@ -29,6 +29,7 @@ #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/kernel.h> +#include <linux/list_sort.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/moduleparam.h> @@ -80,6 +81,7 @@ static wait_queue_head_t nvme_kthread_wait; static struct class *nvme_class; static void nvme_reset_failed_dev(struct work_struct *ws); +static int nvme_reset(struct nvme_dev *dev); static int nvme_process_cq(struct nvme_queue *nvmeq); struct async_cmd_info { @@ -102,6 +104,7 @@ struct nvme_queue { spinlock_t q_lock; struct nvme_command *sq_cmds; volatile struct nvme_completion *cqes; + struct blk_mq_tags **tags; dma_addr_t sq_dma_addr; dma_addr_t cq_dma_addr; u32 __iomem *q_db; @@ -114,7 +117,6 @@ struct nvme_queue { u8 cq_phase; u8 cqe_seen; struct async_cmd_info cmdinfo; - struct blk_mq_hw_ctx *hctx; }; /* @@ -182,9 +184,12 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, struct nvme_dev *dev = data; struct nvme_queue *nvmeq = dev->queues[0]; - WARN_ON(nvmeq->hctx); - nvmeq->hctx = hctx; + WARN_ON(hctx_idx != 0); + WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); + WARN_ON(nvmeq->tags); + hctx->driver_data = nvmeq; + nvmeq->tags = &dev->admin_tagset.tags[0]; return 0; } @@ -201,27 +206,16 @@ static int nvme_admin_init_request(void *data, struct request *req, return 0; } -static void nvme_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) -{ - struct nvme_queue *nvmeq = hctx->driver_data; - - nvmeq->hctx = NULL; -} - static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[ - (hctx_idx % dev->queue_count) + 1]; - - if (!nvmeq->hctx) - nvmeq->hctx = hctx; + struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; - /* nvmeq queues are shared between namespaces. We assume here that - * blk-mq map the tags so they match up with the nvme queue tags. */ - WARN_ON(nvmeq->hctx->tags != hctx->tags); + if (!nvmeq->tags) + nvmeq->tags = &dev->tagset.tags[hctx_idx]; + WARN_ON(dev->tagset.tags[hctx_idx] != hctx->tags); hctx->driver_data = nvmeq; return 0; } @@ -307,9 +301,16 @@ static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) ++nvmeq->dev->event_limit; - if (status == NVME_SC_SUCCESS) - dev_warn(nvmeq->q_dmadev, - "async event result %08x\n", result); + if (status != NVME_SC_SUCCESS) + return; + + switch (result & 0xff07) { + case NVME_AER_NOTICE_NS_CHANGED: + dev_info(nvmeq->q_dmadev, "rescanning\n"); + schedule_work(&nvmeq->dev->scan_work); + default: + dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); + } } static void abort_completion(struct nvme_queue *nvmeq, void *ctx, @@ -320,7 +321,7 @@ static void abort_completion(struct nvme_queue *nvmeq, void *ctx, u16 status = le16_to_cpup(&cqe->status) >> 1; u32 result = le32_to_cpup(&cqe->result); - blk_mq_free_hctx_request(nvmeq->hctx, req); + blk_mq_free_request(req); dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); ++nvmeq->dev->abort_limit; @@ -333,14 +334,13 @@ static void async_completion(struct nvme_queue *nvmeq, void *ctx, cmdinfo->result = le32_to_cpup(&cqe->result); cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_mq_free_hctx_request(nvmeq->hctx, cmdinfo->req); + blk_mq_free_request(cmdinfo->req); } static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, unsigned int tag) { - struct blk_mq_hw_ctx *hctx = nvmeq->hctx; - struct request *req = blk_mq_tag_to_rq(hctx->tags, tag); + struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); return blk_mq_rq_to_pdu(req); } @@ -445,7 +445,7 @@ static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, (unsigned long) rq, gfp); } -void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) { const int last_prp = dev->page_size / 8 - 1; int i; @@ -605,22 +605,30 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, spin_unlock_irqrestore(req->q->queue_lock, flags); return; } - req->errors = nvme_error_status(status); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + req->errors = status; + } else { + req->errors = nvme_error_status(status); + } } else req->errors = 0; + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + u32 result = le32_to_cpup(&cqe->result); + req->special = (void *)(uintptr_t)result; + } if (cmd_rq->aborted) - dev_warn(&nvmeq->dev->pci_dev->dev, + dev_warn(nvmeq->dev->dev, "completing aborted command with status:%04x\n", status); if (iod->nents) { - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, iod->nents, + dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); if (blk_integrity_rq(req)) { if (!rq_data_dir(req)) nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->meta_sg, 1, + dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); } } @@ -630,8 +638,8 @@ static void req_completion(struct nvme_queue *nvmeq, void *ctx, } /* length is in bytes. gfp flags indicates whether we may sleep. */ -int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, - gfp_t gfp) +static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, + int total_len, gfp_t gfp) { struct dma_pool *pool; int length = total_len; @@ -709,6 +717,23 @@ int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, int total_len, return total_len; } +static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, + struct nvme_iod *iod) +{ + struct nvme_command *cmnd = &nvmeq->sq_cmds[nvmeq->sq_tail]; + + memcpy(cmnd, req->cmd, sizeof(struct nvme_command)); + cmnd->rw.command_id = req->tag; + if (req->nr_phys_segments) { + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + } + + if (++nvmeq->sq_tail == nvmeq->q_depth) + nvmeq->sq_tail = 0; + writel(nvmeq->sq_tail, nvmeq->q_db); +} + /* * We reuse the small pool to allocate the 16-byte range here as it is not * worth having a special pool for these or additional cases to handle freeing @@ -807,11 +832,15 @@ static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, return 0; } +/* + * NOTE: ns is NULL when called on the admin queue. + */ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, const struct blk_mq_queue_data *bd) { struct nvme_ns *ns = hctx->queue->queuedata; struct nvme_queue *nvmeq = hctx->driver_data; + struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); struct nvme_iod *iod; @@ -822,15 +851,16 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * unless this namespace is formated such that the metadata can be * stripped/generated by the controller with PRACT=1. */ - if (ns->ms && !blk_integrity_rq(req)) { - if (!(ns->pi_type && ns->ms == 8)) { + if (ns && ns->ms && !blk_integrity_rq(req)) { + if (!(ns->pi_type && ns->ms == 8) && + req->cmd_type != REQ_TYPE_DRV_PRIV) { req->errors = -EFAULT; blk_mq_complete_request(req); return BLK_MQ_RQ_QUEUE_OK; } } - iod = nvme_alloc_iod(req, ns->dev, GFP_ATOMIC); + iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); if (!iod) return BLK_MQ_RQ_QUEUE_BUSY; @@ -841,8 +871,7 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, * as it is not worth having a special pool for these or * additional cases to handle freeing the iod. */ - range = dma_pool_alloc(nvmeq->dev->prp_small_pool, - GFP_ATOMIC, + range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, &iod->first_dma); if (!range) goto retry_cmd; @@ -860,9 +889,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, goto retry_cmd; if (blk_rq_bytes(req) != - nvme_setup_prps(nvmeq->dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { - dma_unmap_sg(&nvmeq->dev->pci_dev->dev, iod->sg, - iod->nents, dma_dir); + nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); goto retry_cmd; } if (blk_integrity_rq(req)) { @@ -884,7 +912,9 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, nvme_set_info(cmd, iod, req_completion); spin_lock_irq(&nvmeq->q_lock); - if (req->cmd_flags & REQ_DISCARD) + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + nvme_submit_priv(nvmeq, req, iod); + else if (req->cmd_flags & REQ_DISCARD) nvme_submit_discard(nvmeq, ns, req, iod); else if (req->cmd_flags & REQ_FLUSH) nvme_submit_flush(nvmeq, ns, req->tag); @@ -896,10 +926,10 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, return BLK_MQ_RQ_QUEUE_OK; error_cmd: - nvme_free_iod(nvmeq->dev, iod); + nvme_free_iod(dev, iod); return BLK_MQ_RQ_QUEUE_ERROR; retry_cmd: - nvme_free_iod(nvmeq->dev, iod); + nvme_free_iod(dev, iod); return BLK_MQ_RQ_QUEUE_BUSY; } @@ -942,15 +972,6 @@ static int nvme_process_cq(struct nvme_queue *nvmeq) return 1; } -/* Admin queue isn't initialized as a request queue. If at some point this - * happens anyway, make sure to notify the user */ -static int nvme_admin_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *bd) -{ - WARN_ON_ONCE(1); - return BLK_MQ_RQ_QUEUE_ERROR; -} - static irqreturn_t nvme_irq(int irq, void *data) { irqreturn_t result; @@ -972,46 +993,61 @@ static irqreturn_t nvme_irq_check(int irq, void *data) return IRQ_WAKE_THREAD; } -struct sync_cmd_info { - struct task_struct *task; - u32 result; - int status; -}; - -static void sync_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct sync_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - wake_up_process(cmdinfo->task); -} - /* * Returns 0 on success. If the result is negative, it's a Linux error code; * if the result is positive, it's an NVM Express status code */ -static int nvme_submit_sync_cmd(struct request *req, struct nvme_command *cmd, - u32 *result, unsigned timeout) +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, void __user *ubuffer, unsigned bufflen, + u32 *result, unsigned timeout) { - struct sync_cmd_info cmdinfo; - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; + bool write = cmd->common.opcode & 1; + struct bio *bio = NULL; + struct request *req; + int ret; - cmdinfo.task = current; - cmdinfo.status = -EINTR; + req = blk_mq_alloc_request(q, write, GFP_KERNEL, false); + if (IS_ERR(req)) + return PTR_ERR(req); - cmd->common.command_id = req->tag; + req->cmd_type = REQ_TYPE_DRV_PRIV; + req->cmd_flags |= REQ_FAILFAST_DRIVER; + req->__data_len = 0; + req->__sector = (sector_t) -1; + req->bio = req->biotail = NULL; + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; - nvme_set_info(cmd_rq, &cmdinfo, sync_completion); + req->cmd = (unsigned char *)cmd; + req->cmd_len = sizeof(struct nvme_command); + req->special = (void *)0; - set_current_state(TASK_UNINTERRUPTIBLE); - nvme_submit_cmd(nvmeq, cmd); - schedule(); + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, __GFP_WAIT); + if (ret) + goto out; + } else if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, __GFP_WAIT); + if (ret) + goto out; + bio = req->bio; + } + blk_execute_rq(req->q, NULL, req, 0); + if (bio) + blk_rq_unmap_user(bio); if (result) - *result = cmdinfo.result; - return cmdinfo.status; + *result = (u32)(uintptr_t)req->special; + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); } static int nvme_submit_async_admin_req(struct nvme_dev *dev) @@ -1033,7 +1069,7 @@ static int nvme_submit_async_admin_req(struct nvme_dev *dev) c.common.opcode = nvme_admin_async_event; c.common.command_id = req->tag; - blk_mq_free_hctx_request(nvmeq->hctx, req); + blk_mq_free_request(req); return __nvme_submit_cmd(nvmeq, &c); } @@ -1060,41 +1096,6 @@ static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, return nvme_submit_cmd(nvmeq, cmd); } -static int __nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result, unsigned timeout) -{ - int res; - struct request *req; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, GFP_KERNEL, false); - if (IS_ERR(req)) - return PTR_ERR(req); - res = nvme_submit_sync_cmd(req, cmd, result, timeout); - blk_mq_free_request(req); - return res; -} - -int nvme_submit_admin_cmd(struct nvme_dev *dev, struct nvme_command *cmd, - u32 *result) -{ - return __nvme_submit_admin_cmd(dev, cmd, result, ADMIN_TIMEOUT); -} - -int nvme_submit_io_cmd(struct nvme_dev *dev, struct nvme_ns *ns, - struct nvme_command *cmd, u32 *result) -{ - int res; - struct request *req; - - req = blk_mq_alloc_request(ns->queue, WRITE, (GFP_KERNEL|__GFP_WAIT), - false); - if (IS_ERR(req)) - return PTR_ERR(req); - res = nvme_submit_sync_cmd(req, cmd, result, NVME_IO_TIMEOUT); - blk_mq_free_request(req); - return res; -} - static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) { struct nvme_command c; @@ -1103,7 +1104,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, @@ -1112,6 +1113,10 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_CQ_IRQ_ENABLED; + /* + * Note: we (ab)use the fact the the prp fields survive if no data + * is attached to the request. + */ memset(&c, 0, sizeof(c)); c.create_cq.opcode = nvme_admin_create_cq; c.create_cq.prp1 = cpu_to_le64(nvmeq->cq_dma_addr); @@ -1120,7 +1125,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, @@ -1129,6 +1134,10 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, struct nvme_command c; int flags = NVME_QUEUE_PHYS_CONTIG | NVME_SQ_PRIO_MEDIUM; + /* + * Note: we (ab)use the fact the the prp fields survive if no data + * is attached to the request. + */ memset(&c, 0, sizeof(c)); c.create_sq.opcode = nvme_admin_create_sq; c.create_sq.prp1 = cpu_to_le64(nvmeq->sq_dma_addr); @@ -1137,7 +1146,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - return nvme_submit_admin_cmd(dev, &c, NULL); + return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1150,18 +1159,43 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -int nvme_identify(struct nvme_dev *dev, unsigned nsid, unsigned cns, - dma_addr_t dma_addr) +int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id) { - struct nvme_command c; + struct nvme_command c = { + .identify.opcode = nvme_admin_identify, + .identify.cns = cpu_to_le32(1), + }; + int error; - memset(&c, 0, sizeof(c)); - c.identify.opcode = nvme_admin_identify; - c.identify.nsid = cpu_to_le32(nsid); - c.identify.prp1 = cpu_to_le64(dma_addr); - c.identify.cns = cpu_to_le32(cns); + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; - return nvme_submit_admin_cmd(dev, &c, NULL); + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, + struct nvme_id_ns **id) +{ + struct nvme_command c = { + .identify.opcode = nvme_admin_identify, + .identify.nsid = cpu_to_le32(nsid), + }; + int error; + + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ns)); + if (error) + kfree(*id); + return error; } int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, @@ -1175,7 +1209,8 @@ int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, c.features.prp1 = cpu_to_le64(dma_addr); c.features.fid = cpu_to_le32(fid); - return nvme_submit_admin_cmd(dev, &c, result); + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, + result, 0); } int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, @@ -1189,7 +1224,30 @@ int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, c.features.fid = cpu_to_le32(fid); c.features.dword11 = cpu_to_le32(dword11); - return nvme_submit_admin_cmd(dev, &c, result); + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, + result, 0); +} + +int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) +{ + struct nvme_command c = { + .common.opcode = nvme_admin_get_log_page, + .common.nsid = cpu_to_le32(0xFFFFFFFF), + .common.cdw10[0] = cpu_to_le32( + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | + NVME_LOG_SMART), + }; + int error; + + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); + if (!*log) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, + sizeof(struct nvme_smart_log)); + if (error) + kfree(*log); + return error; } /** @@ -1214,8 +1272,7 @@ static void nvme_abort_req(struct request *req) if (work_busy(&dev->reset_work)) goto out; list_del_init(&dev->node); - dev_warn(&dev->pci_dev->dev, - "I/O %d QID %d timeout, reset controller\n", + dev_warn(dev->dev, "I/O %d QID %d timeout, reset controller\n", req->tag, nvmeq->qid); dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); @@ -1254,8 +1311,7 @@ static void nvme_abort_req(struct request *req) } } -static void nvme_cancel_queue_ios(struct blk_mq_hw_ctx *hctx, - struct request *req, void *data, bool reserved) +static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) { struct nvme_queue *nvmeq = data; void *ctx; @@ -1352,11 +1408,9 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_clear_queue(struct nvme_queue *nvmeq) { - struct blk_mq_hw_ctx *hctx = nvmeq->hctx; - spin_lock_irq(&nvmeq->q_lock); - if (hctx && hctx->tags) - blk_mq_tag_busy_iter(hctx, nvme_cancel_queue_ios, nvmeq); + if (nvmeq->tags && *nvmeq->tags) + blk_mq_all_tag_busy_iter(*nvmeq->tags, nvme_cancel_queue_ios, nvmeq); spin_unlock_irq(&nvmeq->q_lock); } @@ -1384,22 +1438,21 @@ static void nvme_disable_queue(struct nvme_dev *dev, int qid) static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, int depth) { - struct device *dmadev = &dev->pci_dev->dev; struct nvme_queue *nvmeq = kzalloc(sizeof(*nvmeq), GFP_KERNEL); if (!nvmeq) return NULL; - nvmeq->cqes = dma_zalloc_coherent(dmadev, CQ_SIZE(depth), + nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), &nvmeq->cq_dma_addr, GFP_KERNEL); if (!nvmeq->cqes) goto free_nvmeq; - nvmeq->sq_cmds = dma_alloc_coherent(dmadev, SQ_SIZE(depth), + nvmeq->sq_cmds = dma_alloc_coherent(dev->dev, SQ_SIZE(depth), &nvmeq->sq_dma_addr, GFP_KERNEL); if (!nvmeq->sq_cmds) goto free_cqdma; - nvmeq->q_dmadev = dmadev; + nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", dev->instance, qid); @@ -1409,13 +1462,16 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_db = &dev->dbs[qid * 2 * dev->db_stride]; nvmeq->q_depth = depth; nvmeq->qid = qid; - dev->queue_count++; dev->queues[qid] = nvmeq; + /* make sure queue descriptor is set before queue count, for kthread */ + mb(); + dev->queue_count++; + return nvmeq; free_cqdma: - dma_free_coherent(dmadev, CQ_SIZE(depth), (void *)nvmeq->cqes, + dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: kfree(nvmeq); @@ -1487,7 +1543,7 @@ static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Device not ready; aborting %s\n", enabled ? "initialisation" : "reset"); return -ENODEV; @@ -1537,7 +1593,7 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) if (fatal_signal_pending(current)) return -EINTR; if (time_after(jiffies, timeout)) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Device shutdown incomplete; abort shutdown\n"); return -ENODEV; } @@ -1547,10 +1603,9 @@ static int nvme_shutdown_ctrl(struct nvme_dev *dev) } static struct blk_mq_ops nvme_mq_admin_ops = { - .queue_rq = nvme_admin_queue_rq, + .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, - .exit_hctx = nvme_exit_hctx, .init_request = nvme_admin_init_request, .timeout = nvme_timeout, }; @@ -1559,7 +1614,6 @@ static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, - .exit_hctx = nvme_exit_hctx, .init_request = nvme_init_request, .timeout = nvme_timeout, }; @@ -1580,7 +1634,7 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; dev->admin_tagset.reserved_tags = 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; - dev->admin_tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); dev->admin_tagset.driver_data = dev; @@ -1613,14 +1667,14 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) unsigned dev_page_max = NVME_CAP_MPSMAX(cap) + 12; if (page_shift < dev_page_min) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Minimum device page size (%u) too large for " "host (%u)\n", 1 << dev_page_min, 1 << page_shift); return -ENODEV; } if (page_shift > dev_page_max) { - dev_info(&dev->pci_dev->dev, + dev_info(dev->dev, "Device maximum page size (%u) smaller than " "host (%u); enabling work-around\n", 1 << dev_page_max, 1 << page_shift); @@ -1668,126 +1722,43 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -struct nvme_iod *nvme_map_user_pages(struct nvme_dev *dev, int write, - unsigned long addr, unsigned length) -{ - int i, err, count, nents, offset; - struct scatterlist *sg; - struct page **pages; - struct nvme_iod *iod; - - if (addr & 3) - return ERR_PTR(-EINVAL); - if (!length || length > INT_MAX - PAGE_SIZE) - return ERR_PTR(-EINVAL); - - offset = offset_in_page(addr); - count = DIV_ROUND_UP(offset + length, PAGE_SIZE); - pages = kcalloc(count, sizeof(*pages), GFP_KERNEL); - if (!pages) - return ERR_PTR(-ENOMEM); - - err = get_user_pages_fast(addr, count, 1, pages); - if (err < count) { - count = err; - err = -EFAULT; - goto put_pages; - } - - err = -ENOMEM; - iod = __nvme_alloc_iod(count, length, dev, 0, GFP_KERNEL); - if (!iod) - goto put_pages; - - sg = iod->sg; - sg_init_table(sg, count); - for (i = 0; i < count; i++) { - sg_set_page(&sg[i], pages[i], - min_t(unsigned, length, PAGE_SIZE - offset), - offset); - length -= (PAGE_SIZE - offset); - offset = 0; - } - sg_mark_end(&sg[i - 1]); - iod->nents = count; - - nents = dma_map_sg(&dev->pci_dev->dev, sg, count, - write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (!nents) - goto free_iod; - - kfree(pages); - return iod; - - free_iod: - kfree(iod); - put_pages: - for (i = 0; i < count; i++) - put_page(pages[i]); - kfree(pages); - return ERR_PTR(err); -} - -void nvme_unmap_user_pages(struct nvme_dev *dev, int write, - struct nvme_iod *iod) -{ - int i; - - dma_unmap_sg(&dev->pci_dev->dev, iod->sg, iod->nents, - write ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - - for (i = 0; i < iod->nents; i++) - put_page(sg_page(&iod->sg[i])); -} - static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) { struct nvme_dev *dev = ns->dev; struct nvme_user_io io; struct nvme_command c; - unsigned length, meta_len, prp_len; + unsigned length, meta_len; int status, write; - struct nvme_iod *iod; dma_addr_t meta_dma = 0; void *meta = NULL; void __user *metadata; if (copy_from_user(&io, uio, sizeof(io))) return -EFAULT; - length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - - if (meta_len && ((io.metadata & 3) || !io.metadata) && !ns->ext) - return -EINVAL; - else if (meta_len && ns->ext) { - length += meta_len; - meta_len = 0; - } - - metadata = (void __user *)(unsigned long)io.metadata; - - write = io.opcode & 1; switch (io.opcode) { case nvme_cmd_write: case nvme_cmd_read: case nvme_cmd_compare: - iod = nvme_map_user_pages(dev, write, io.addr, length); break; default: return -EINVAL; } - if (IS_ERR(iod)) - return PTR_ERR(iod); + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(unsigned long)io.metadata; + write = io.opcode & 1; - prp_len = nvme_setup_prps(dev, iod, length, GFP_KERNEL); - if (length != prp_len) { - status = -ENOMEM; - goto unmap; + if (ns->ext) { + length += meta_len; + meta_len = 0; } if (meta_len) { - meta = dma_alloc_coherent(&dev->pci_dev->dev, meta_len, + if (((io.metadata & 3) || !io.metadata) && !ns->ext) + return -EINVAL; + + meta = dma_alloc_coherent(dev->dev, meta_len, &meta_dma, GFP_KERNEL); if (!meta) { @@ -1813,19 +1784,17 @@ static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) c.rw.reftag = cpu_to_le32(io.reftag); c.rw.apptag = cpu_to_le16(io.apptag); c.rw.appmask = cpu_to_le16(io.appmask); - c.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.rw.prp2 = cpu_to_le64(iod->first_dma); c.rw.metadata = cpu_to_le64(meta_dma); - status = nvme_submit_io_cmd(dev, ns, &c, NULL); + + status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + (void __user *)io.addr, length, NULL, 0); unmap: - nvme_unmap_user_pages(dev, write, iod); - nvme_free_iod(dev, iod); if (meta) { if (status == NVME_SC_SUCCESS && !write) { if (copy_to_user(metadata, meta, meta_len)) status = -EFAULT; } - dma_free_coherent(&dev->pci_dev->dev, meta_len, meta, meta_dma); + dma_free_coherent(dev->dev, meta_len, meta, meta_dma); } return status; } @@ -1835,9 +1804,8 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, { struct nvme_passthru_cmd cmd; struct nvme_command c; - int status, length; - struct nvme_iod *uninitialized_var(iod); - unsigned timeout; + unsigned timeout = 0; + int status; if (!capable(CAP_SYS_ADMIN)) return -EACCES; @@ -1857,46 +1825,17 @@ static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - length = cmd.data_len; - if (cmd.data_len) { - iod = nvme_map_user_pages(dev, cmd.opcode & 1, cmd.addr, - length); - if (IS_ERR(iod)) - return PTR_ERR(iod); - length = nvme_setup_prps(dev, iod, length, GFP_KERNEL); - c.common.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - c.common.prp2 = cpu_to_le64(iod->first_dma); - } - - timeout = cmd.timeout_ms ? msecs_to_jiffies(cmd.timeout_ms) : - ADMIN_TIMEOUT; - - if (length != cmd.data_len) - status = -ENOMEM; - else if (ns) { - struct request *req; - - req = blk_mq_alloc_request(ns->queue, WRITE, - (GFP_KERNEL|__GFP_WAIT), false); - if (IS_ERR(req)) - status = PTR_ERR(req); - else { - status = nvme_submit_sync_cmd(req, &c, &cmd.result, - timeout); - blk_mq_free_request(req); - } - } else - status = __nvme_submit_admin_cmd(dev, &c, &cmd.result, timeout); + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); - if (cmd.data_len) { - nvme_unmap_user_pages(dev, cmd.opcode & 1, iod); - nvme_free_iod(dev, iod); + status = __nvme_submit_sync_cmd(ns ? ns->queue : dev->admin_q, &c, + NULL, (void __user *)cmd.addr, cmd.data_len, + &cmd.result, timeout); + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; } - if ((status >= 0) && copy_to_user(&ucmd->result, &cmd.result, - sizeof(cmd.result))) - status = -EFAULT; - return status; } @@ -1988,23 +1927,18 @@ static int nvme_revalidate_disk(struct gendisk *disk) struct nvme_ns *ns = disk->private_data; struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id; - dma_addr_t dma_addr; u8 lbaf, pi_type; u16 old_ms; unsigned short bs; - id = dma_alloc_coherent(&dev->pci_dev->dev, 4096, &dma_addr, - GFP_KERNEL); - if (!id) { - dev_warn(&dev->pci_dev->dev, "%s: Memory alocation failure\n", - __func__); - return 0; + if (nvme_identify_ns(dev, ns->ns_id, &id)) { + dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, + dev->instance, ns->ns_id); + return -ENODEV; } - if (nvme_identify(dev, ns->ns_id, 0, dma_addr)) { - dev_warn(&dev->pci_dev->dev, - "identify failed ns:%d, setting capacity to 0\n", - ns->ns_id); - memset(id, 0, sizeof(*id)); + if (id->ncap == 0) { + kfree(id); + return -ENODEV; } old_ms = ns->ms; @@ -2038,7 +1972,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) !ns->ext) nvme_init_integrity(ns); - if (id->ncap == 0 || (ns->ms && !blk_get_integrity(disk))) + if (ns->ms && !blk_get_integrity(disk)) set_capacity(disk, 0); else set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); @@ -2046,7 +1980,7 @@ static int nvme_revalidate_disk(struct gendisk *disk) if (dev->oncs & NVME_CTRL_ONCS_DSM) nvme_config_discard(ns); - dma_free_coherent(&dev->pci_dev->dev, 4096, id, dma_addr); + kfree(id); return 0; } @@ -2073,7 +2007,7 @@ static int nvme_kthread(void *data) if (work_busy(&dev->reset_work)) continue; list_del_init(&dev->node); - dev_warn(&dev->pci_dev->dev, + dev_warn(dev->dev, "Failed status: %x, reset controller\n", readl(&dev->bar->csts)); dev->reset_workfn = nvme_reset_failed_dev; @@ -2105,7 +2039,7 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) { struct nvme_ns *ns; struct gendisk *disk; - int node = dev_to_node(&dev->pci_dev->dev); + int node = dev_to_node(dev->dev); ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); if (!ns) @@ -2153,11 +2087,16 @@ static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) * requires it. */ set_capacity(disk, 0); - nvme_revalidate_disk(ns->disk); + if (nvme_revalidate_disk(ns->disk)) + goto out_free_disk; + add_disk(ns->disk); if (ns->ms) revalidate_disk(ns->disk); return; + out_free_disk: + kfree(disk); + list_del(&ns->list); out_free_queue: blk_cleanup_queue(ns->queue); out_free_ns: @@ -2188,8 +2127,7 @@ static int set_queue_count(struct nvme_dev *dev, int count) if (status < 0) return status; if (status > 0) { - dev_err(&dev->pci_dev->dev, "Could not set queue count (%d)\n", - status); + dev_err(dev->dev, "Could not set queue count (%d)\n", status); return 0; } return min(result & 0xffff, result >> 16) + 1; @@ -2203,7 +2141,7 @@ static size_t db_bar_size(struct nvme_dev *dev, unsigned nr_io_queues) static int nvme_setup_io_queues(struct nvme_dev *dev) { struct nvme_queue *adminq = dev->queues[0]; - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); @@ -2275,6 +2213,99 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return result; } +static void nvme_free_namespace(struct nvme_ns *ns) +{ + list_del(&ns->list); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + put_disk(ns->disk); + kfree(ns); +} + +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->ns_id - nsb->ns_id; +} + +static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) +{ + struct nvme_ns *ns; + + list_for_each_entry(ns, &dev->namespaces, list) { + if (ns->ns_id == nsid) + return ns; + if (ns->ns_id > nsid) + break; + } + return NULL; +} + +static inline bool nvme_io_incapable(struct nvme_dev *dev) +{ + return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || + dev->online_queues < 2); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); + + if (kill) + blk_set_queue_dying(ns->queue); + if (ns->disk->flags & GENHD_FL_UP) { + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + del_gendisk(ns->disk); + } + if (kill || !blk_queue_dying(ns->queue)) { + blk_mq_abort_requeue_list(ns->queue); + blk_cleanup_queue(ns->queue); + } +} + +static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) +{ + struct nvme_ns *ns, *next; + unsigned i; + + for (i = 1; i <= nn; i++) { + ns = nvme_find_ns(dev, i); + if (ns) { + if (revalidate_disk(ns->disk)) { + nvme_ns_remove(ns); + nvme_free_namespace(ns); + } + } else + nvme_alloc_ns(dev, i); + } + list_for_each_entry_safe(ns, next, &dev->namespaces, list) { + if (ns->ns_id > nn) { + nvme_ns_remove(ns); + nvme_free_namespace(ns); + } + } + list_sort(NULL, &dev->namespaces, ns_cmp); +} + +static void nvme_dev_scan(struct work_struct *work) +{ + struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); + struct nvme_id_ctrl *ctrl; + + if (!dev->tagset.tags) + return; + if (nvme_identify_ctrl(dev, &ctrl)) + return; + nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); + kfree(ctrl); +} + /* * Return: error value if an error occurred setting up the queues or calling * Identify Device. 0 if these succeeded, even if adding some of the @@ -2283,26 +2314,18 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) */ static int nvme_dev_add(struct nvme_dev *dev) { - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); int res; - unsigned nn, i; + unsigned nn; struct nvme_id_ctrl *ctrl; - void *mem; - dma_addr_t dma_addr; int shift = NVME_CAP_MPSMIN(readq(&dev->bar->cap)) + 12; - mem = dma_alloc_coherent(&pdev->dev, 4096, &dma_addr, GFP_KERNEL); - if (!mem) - return -ENOMEM; - - res = nvme_identify(dev, 0, 1, dma_addr); + res = nvme_identify_ctrl(dev, &ctrl); if (res) { - dev_err(&pdev->dev, "Identify Controller failed (%d)\n", res); - dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + dev_err(dev->dev, "Identify Controller failed (%d)\n", res); return -EIO; } - ctrl = mem; nn = le32_to_cpup(&ctrl->nn); dev->oncs = le16_to_cpup(&ctrl->oncs); dev->abort_limit = ctrl->acl + 1; @@ -2324,12 +2347,12 @@ static int nvme_dev_add(struct nvme_dev *dev) } else dev->max_hw_sectors = max_hw_sectors; } - dma_free_coherent(&dev->pci_dev->dev, 4096, mem, dma_addr); + kfree(ctrl); dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.timeout = NVME_IO_TIMEOUT; - dev->tagset.numa_node = dev_to_node(&dev->pci_dev->dev); + dev->tagset.numa_node = dev_to_node(dev->dev); dev->tagset.queue_depth = min_t(int, dev->q_depth, BLK_MQ_MAX_DEPTH) - 1; dev->tagset.cmd_size = nvme_cmd_size(dev); @@ -2339,9 +2362,7 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; - for (i = 1; i <= nn; i++) - nvme_alloc_ns(dev, i); - + schedule_work(&dev->scan_work); return 0; } @@ -2349,7 +2370,7 @@ static int nvme_dev_map(struct nvme_dev *dev) { u64 cap; int bars, result = -ENOMEM; - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_enable_device_mem(pdev)) return result; @@ -2363,8 +2384,8 @@ static int nvme_dev_map(struct nvme_dev *dev) if (pci_request_selected_regions(pdev, bars, "nvme")) goto disable_pci; - if (dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(64)) && - dma_set_mask_and_coherent(&pdev->dev, DMA_BIT_MASK(32))) + if (dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(64)) && + dma_set_mask_and_coherent(dev->dev, DMA_BIT_MASK(32))) goto disable; dev->bar = ioremap(pci_resource_start(pdev, 0), 8192); @@ -2405,19 +2426,21 @@ static int nvme_dev_map(struct nvme_dev *dev) static void nvme_dev_unmap(struct nvme_dev *dev) { - if (dev->pci_dev->msi_enabled) - pci_disable_msi(dev->pci_dev); - else if (dev->pci_dev->msix_enabled) - pci_disable_msix(dev->pci_dev); + struct pci_dev *pdev = to_pci_dev(dev->dev); + + if (pdev->msi_enabled) + pci_disable_msi(pdev); + else if (pdev->msix_enabled) + pci_disable_msix(pdev); if (dev->bar) { iounmap(dev->bar); dev->bar = NULL; - pci_release_regions(dev->pci_dev); + pci_release_regions(pdev); } - if (pci_is_enabled(dev->pci_dev)) - pci_disable_device(dev->pci_dev); + if (pci_is_enabled(pdev)) + pci_disable_device(pdev); } struct nvme_delq_ctx { @@ -2536,7 +2559,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev) &worker, "nvme%d", dev->instance); if (IS_ERR(kworker_task)) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Failed to create queue del task\n"); for (i = dev->queue_count - 1; i > 0; i--) nvme_disable_queue(dev, i); @@ -2587,9 +2610,9 @@ static void nvme_freeze_queues(struct nvme_dev *dev) list_for_each_entry(ns, &dev->namespaces, list) { blk_mq_freeze_queue_start(ns->queue); - spin_lock(ns->queue->queue_lock); + spin_lock_irq(ns->queue->queue_lock); queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); - spin_unlock(ns->queue->queue_lock); + spin_unlock_irq(ns->queue->queue_lock); blk_mq_cancel_requeue_work(ns->queue); blk_mq_stop_hw_queues(ns->queue); @@ -2639,29 +2662,19 @@ static void nvme_dev_remove(struct nvme_dev *dev) { struct nvme_ns *ns; - list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->disk->flags & GENHD_FL_UP) { - if (blk_get_integrity(ns->disk)) - blk_integrity_unregister(ns->disk); - del_gendisk(ns->disk); - } - if (!blk_queue_dying(ns->queue)) { - blk_mq_abort_requeue_list(ns->queue); - blk_cleanup_queue(ns->queue); - } - } + list_for_each_entry(ns, &dev->namespaces, list) + nvme_ns_remove(ns); } static int nvme_setup_prp_pools(struct nvme_dev *dev) { - struct device *dmadev = &dev->pci_dev->dev; - dev->prp_page_pool = dma_pool_create("prp list page", dmadev, + dev->prp_page_pool = dma_pool_create("prp list page", dev->dev, PAGE_SIZE, PAGE_SIZE, 0); if (!dev->prp_page_pool) return -ENOMEM; /* Optimisation for I/Os between 4k and 128k */ - dev->prp_small_pool = dma_pool_create("prp list 256", dmadev, + dev->prp_small_pool = dma_pool_create("prp list 256", dev->dev, 256, 256, 0); if (!dev->prp_small_pool) { dma_pool_destroy(dev->prp_page_pool); @@ -2709,23 +2722,15 @@ static void nvme_free_namespaces(struct nvme_dev *dev) { struct nvme_ns *ns, *next; - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - list_del(&ns->list); - - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - - put_disk(ns->disk); - kfree(ns); - } + list_for_each_entry_safe(ns, next, &dev->namespaces, list) + nvme_free_namespace(ns); } static void nvme_free_dev(struct kref *kref) { struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); - pci_dev_put(dev->pci_dev); + put_device(dev->dev); put_device(dev->device); nvme_free_namespaces(dev); nvme_release_instance(dev); @@ -2781,6 +2786,9 @@ static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) return -ENOTTY; ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); return nvme_user_cmd(dev, ns, (void __user *)arg); + case NVME_IOCTL_RESET: + dev_warn(dev->dev, "resetting controller\n"); + return nvme_reset(dev); default: return -ENOTTY; } @@ -2802,11 +2810,11 @@ static void nvme_set_irq_hints(struct nvme_dev *dev) for (i = 0; i < dev->online_queues; i++) { nvmeq = dev->queues[i]; - if (!nvmeq->hctx) + if (!nvmeq->tags || !(*nvmeq->tags)) continue; irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - nvmeq->hctx->cpumask); + blk_mq_tags_cpumask(*nvmeq->tags)); } } @@ -2869,7 +2877,7 @@ static int nvme_dev_start(struct nvme_dev *dev) static int nvme_remove_dead_ctrl(void *arg) { struct nvme_dev *dev = (struct nvme_dev *)arg; - struct pci_dev *pdev = dev->pci_dev; + struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); @@ -2899,6 +2907,7 @@ static int nvme_dev_resume(struct nvme_dev *dev) spin_unlock(&dev_list_lock); } else { nvme_unfreeze_queues(dev); + schedule_work(&dev->scan_work); nvme_set_irq_hints(dev); } return 0; @@ -2908,11 +2917,11 @@ static void nvme_dev_reset(struct nvme_dev *dev) { nvme_dev_shutdown(dev); if (nvme_dev_resume(dev)) { - dev_warn(&dev->pci_dev->dev, "Device failed to resume\n"); + dev_warn(dev->dev, "Device failed to resume\n"); kref_get(&dev->kref); if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", dev->instance))) { - dev_err(&dev->pci_dev->dev, + dev_err(dev->dev, "Failed to start controller remove task\n"); kref_put(&dev->kref, nvme_free_dev); } @@ -2931,6 +2940,44 @@ static void nvme_reset_workfn(struct work_struct *work) dev->reset_workfn(work); } +static int nvme_reset(struct nvme_dev *dev) +{ + int ret = -EBUSY; + + if (!dev->admin_q || blk_queue_dying(dev->admin_q)) + return -ENODEV; + + spin_lock(&dev_list_lock); + if (!work_pending(&dev->reset_work)) { + dev->reset_workfn = nvme_reset_failed_dev; + queue_work(nvme_workq, &dev->reset_work); + ret = 0; + } + spin_unlock(&dev_list_lock); + + if (!ret) { + flush_work(&dev->reset_work); + return 0; + } + + return ret; +} + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_dev *ndev = dev_get_drvdata(dev); + int ret; + + ret = nvme_reset(ndev); + if (ret < 0) + return ret; + + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + static void nvme_async_probe(struct work_struct *work); static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -2956,7 +3003,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); dev->reset_workfn = nvme_reset_failed_dev; INIT_WORK(&dev->reset_work, nvme_reset_workfn); - dev->pci_dev = pci_dev_get(pdev); + dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); result = nvme_set_instance(dev); if (result) @@ -2975,18 +3022,27 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) goto release_pools; } get_device(dev->device); + dev_set_drvdata(dev->device, dev); + + result = device_create_file(dev->device, &dev_attr_reset_controller); + if (result) + goto put_dev; INIT_LIST_HEAD(&dev->node); + INIT_WORK(&dev->scan_work, nvme_dev_scan); INIT_WORK(&dev->probe_work, nvme_async_probe); schedule_work(&dev->probe_work); return 0; + put_dev: + device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); + put_device(dev->device); release_pools: nvme_release_prp_pools(dev); release: nvme_release_instance(dev); put_pci: - pci_dev_put(dev->pci_dev); + put_device(dev->dev); free: kfree(dev->queues); kfree(dev->entry); @@ -3011,10 +3067,12 @@ static void nvme_async_probe(struct work_struct *work) nvme_set_irq_hints(dev); return; reset: + spin_lock(&dev_list_lock); if (!work_busy(&dev->reset_work)) { dev->reset_workfn = nvme_reset_failed_dev; queue_work(nvme_workq, &dev->reset_work); } + spin_unlock(&dev_list_lock); } static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) @@ -3044,6 +3102,8 @@ static void nvme_remove(struct pci_dev *pdev) pci_set_drvdata(pdev, NULL); flush_work(&dev->probe_work); flush_work(&dev->reset_work); + flush_work(&dev->scan_work); + device_remove_file(dev->device, &dev_attr_reset_controller); nvme_dev_shutdown(dev); nvme_dev_remove(dev); nvme_dev_remove_admin(dev); |