From 79c48ccf2fec7c10105bd635d3bb1128167b1258 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 14 Jan 2018 12:39:00 +0200 Subject: nvme-pci: serialize pci resets Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 3 ++- drivers/nvme/host/nvme.h | 1 + drivers/nvme/host/pci.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 2fe15351ac4e..4d8f63b3c5b6 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -95,7 +95,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) } EXPORT_SYMBOL_GPL(nvme_reset_ctrl); -static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) { int ret; @@ -104,6 +104,7 @@ static int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl) flush_work(&ctrl->reset_work); return ret; } +EXPORT_SYMBOL_GPL(nvme_reset_ctrl_sync); static void nvme_delete_ctrl_work(struct work_struct *work) { diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 4112fb6ce80d..77faf2049917 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -394,6 +394,7 @@ int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); void nvme_start_keep_alive(struct nvme_ctrl *ctrl); void nvme_stop_keep_alive(struct nvme_ctrl *ctrl); int nvme_reset_ctrl(struct nvme_ctrl *ctrl); +int nvme_reset_ctrl_sync(struct nvme_ctrl *ctrl); int nvme_delete_ctrl(struct nvme_ctrl *ctrl); int nvme_delete_ctrl_sync(struct nvme_ctrl *ctrl); diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 62119078c2bf..dc9a4cf7c1d1 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2537,7 +2537,7 @@ static void nvme_reset_prepare(struct pci_dev *pdev) static void nvme_reset_done(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_reset_ctrl(&dev->ctrl); + nvme_reset_ctrl_sync(&dev->ctrl); } static void nvme_shutdown(struct pci_dev *pdev) -- cgit v1.2.3 From 147b27e4bd08406a6abebedbb478b431ec197be1 Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 14 Jan 2018 12:39:01 +0200 Subject: nvme-pci: allocate device queues storage space at probe It may cause race by setting 'nvmeq' in nvme_init_request() because .init_request is called inside switching io scheduler, which may happen when the NVMe device is being resetted and its nvme queues are being freed and created. We don't have any sync between the two pathes. This patch changes the nvmeq allocation to occur at probe time so there is no way we can dereference it at init_request. [ 93.268391] kernel BUG at drivers/nvme/host/pci.c:408! [ 93.274146] invalid opcode: 0000 [#1] SMP [ 93.278618] Modules linked in: nfsv3 nfs_acl rpcsec_gss_krb5 auth_rpcgss nfsv4 dns_resolver nfs lockd grace fscache sunrpc ipmi_ssif vfat fat intel_rapl sb_edac x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel iTCO_wdt intel_cstate ipmi_si iTCO_vendor_support intel_uncore mxm_wmi mei_me ipmi_devintf intel_rapl_perf pcspkr sg ipmi_msghandler lpc_ich dcdbas mei shpchp acpi_power_meter wmi dm_multipath ip_tables xfs libcrc32c sd_mod mgag200 i2c_algo_bit drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm ahci libahci nvme libata crc32c_intel nvme_core tg3 megaraid_sas ptp i2c_core pps_core dm_mirror dm_region_hash dm_log dm_mod [ 93.349071] CPU: 5 PID: 1842 Comm: sh Not tainted 4.15.0-rc2.ming+ #4 [ 93.356256] Hardware name: Dell Inc. PowerEdge R730xd/072T6D, BIOS 2.5.5 08/16/2017 [ 93.364801] task: 00000000fb8abf2a task.stack: 0000000028bd82d1 [ 93.371408] RIP: 0010:nvme_init_request+0x36/0x40 [nvme] [ 93.377333] RSP: 0018:ffffc90002537ca8 EFLAGS: 00010246 [ 93.383161] RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000008 [ 93.391122] RDX: 0000000000000000 RSI: ffff880276ae0000 RDI: ffff88047bae9008 [ 93.399084] RBP: ffff88047bae9008 R08: ffff88047bae9008 R09: 0000000009dabc00 [ 93.407045] R10: 0000000000000004 R11: 000000000000299c R12: ffff880186bc1f00 [ 93.415007] R13: ffff880276ae0000 R14: 0000000000000000 R15: 0000000000000071 [ 93.422969] FS: 00007f33cf288740(0000) GS:ffff88047ba80000(0000) knlGS:0000000000000000 [ 93.431996] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 93.438407] CR2: 00007f33cf28e000 CR3: 000000047e5bb006 CR4: 00000000001606e0 [ 93.446368] Call Trace: [ 93.449103] blk_mq_alloc_rqs+0x231/0x2a0 [ 93.453579] blk_mq_sched_alloc_tags.isra.8+0x42/0x80 [ 93.459214] blk_mq_init_sched+0x7e/0x140 [ 93.463687] elevator_switch+0x5a/0x1f0 [ 93.467966] ? elevator_get.isra.17+0x52/0xc0 [ 93.472826] elv_iosched_store+0xde/0x150 [ 93.477299] queue_attr_store+0x4e/0x90 [ 93.481580] kernfs_fop_write+0xfa/0x180 [ 93.485958] __vfs_write+0x33/0x170 [ 93.489851] ? __inode_security_revalidate+0x4c/0x60 [ 93.495390] ? selinux_file_permission+0xda/0x130 [ 93.500641] ? _cond_resched+0x15/0x30 [ 93.504815] vfs_write+0xad/0x1a0 [ 93.508512] SyS_write+0x52/0xc0 [ 93.512113] do_syscall_64+0x61/0x1a0 [ 93.516199] entry_SYSCALL64_slow_path+0x25/0x25 [ 93.521351] RIP: 0033:0x7f33ce96aab0 [ 93.525337] RSP: 002b:00007ffe57570238 EFLAGS: 00000246 ORIG_RAX: 0000000000000001 [ 93.533785] RAX: ffffffffffffffda RBX: 0000000000000006 RCX: 00007f33ce96aab0 [ 93.541746] RDX: 0000000000000006 RSI: 00007f33cf28e000 RDI: 0000000000000001 [ 93.549707] RBP: 00007f33cf28e000 R08: 000000000000000a R09: 00007f33cf288740 [ 93.557669] R10: 00007f33cf288740 R11: 0000000000000246 R12: 00007f33cec42400 [ 93.565630] R13: 0000000000000006 R14: 0000000000000001 R15: 0000000000000000 [ 93.573592] Code: 4c 8d 40 08 4c 39 c7 74 16 48 8b 00 48 8b 04 08 48 85 c0 74 16 48 89 86 78 01 00 00 31 c0 c3 8d 4a 01 48 63 c9 48 c1 e1 03 eb de <0f> 0b 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 85 f6 53 48 89 [ 93.594676] RIP: nvme_init_request+0x36/0x40 [nvme] RSP: ffffc90002537ca8 [ 93.602273] ---[ end trace 810dde3993e5f14e ]--- Reported-by: Yi Zhang Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 63 ++++++++++++++++++++----------------------------- 1 file changed, 26 insertions(+), 37 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index dc9a4cf7c1d1..b058b1e9b5bb 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -75,7 +75,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); * Represents an NVM Express device. Each nvme_dev is a PCI function. */ struct nvme_dev { - struct nvme_queue **queues; + struct nvme_queue *queues; struct blk_mq_tag_set tagset; struct blk_mq_tag_set admin_tagset; u32 __iomem *dbs; @@ -365,7 +365,7 @@ static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; WARN_ON(hctx_idx != 0); WARN_ON(dev->admin_tagset.tags[0] != hctx->tags); @@ -387,7 +387,7 @@ static int nvme_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, unsigned int hctx_idx) { struct nvme_dev *dev = data; - struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; + struct nvme_queue *nvmeq = &dev->queues[hctx_idx + 1]; if (!nvmeq->tags) nvmeq->tags = &dev->tagset.tags[hctx_idx]; @@ -403,7 +403,7 @@ static int nvme_init_request(struct blk_mq_tag_set *set, struct request *req, struct nvme_dev *dev = set->driver_data; struct nvme_iod *iod = blk_mq_rq_to_pdu(req); int queue_idx = (set == &dev->tagset) ? hctx_idx + 1 : 0; - struct nvme_queue *nvmeq = dev->queues[queue_idx]; + struct nvme_queue *nvmeq = &dev->queues[queue_idx]; BUG_ON(!nvmeq); iod->nvmeq = nvmeq; @@ -1046,7 +1046,7 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) static void nvme_pci_submit_async_event(struct nvme_ctrl *ctrl) { struct nvme_dev *dev = to_nvme_dev(ctrl); - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; struct nvme_command c; memset(&c, 0, sizeof(c)); @@ -1282,7 +1282,6 @@ static void nvme_free_queue(struct nvme_queue *nvmeq) if (nvmeq->sq_cmds) dma_free_coherent(nvmeq->q_dmadev, SQ_SIZE(nvmeq->q_depth), nvmeq->sq_cmds, nvmeq->sq_dma_addr); - kfree(nvmeq); } static void nvme_free_queues(struct nvme_dev *dev, int lowest) @@ -1290,10 +1289,8 @@ static void nvme_free_queues(struct nvme_dev *dev, int lowest) int i; for (i = dev->ctrl.queue_count - 1; i >= lowest; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; dev->ctrl.queue_count--; - dev->queues[i] = NULL; - nvme_free_queue(nvmeq); + nvme_free_queue(&dev->queues[i]); } } @@ -1325,10 +1322,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[0]; + struct nvme_queue *nvmeq = &dev->queues[0]; - if (!nvmeq) - return; if (nvme_suspend_queue(nvmeq)) return; @@ -1384,13 +1379,10 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, return 0; } -static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, - int depth, int node) +static int nvme_alloc_queue(struct nvme_dev *dev, int qid, + int depth, int node) { - struct nvme_queue *nvmeq = kzalloc_node(sizeof(*nvmeq), GFP_KERNEL, - node); - if (!nvmeq) - return NULL; + struct nvme_queue *nvmeq = &dev->queues[qid]; nvmeq->cqes = dma_zalloc_coherent(dev->dev, CQ_SIZE(depth), &nvmeq->cq_dma_addr, GFP_KERNEL); @@ -1409,17 +1401,15 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_depth = depth; nvmeq->qid = qid; nvmeq->cq_vector = -1; - dev->queues[qid] = nvmeq; dev->ctrl.queue_count++; - return nvmeq; + return 0; free_cqdma: dma_free_coherent(dev->dev, CQ_SIZE(depth), (void *)nvmeq->cqes, nvmeq->cq_dma_addr); free_nvmeq: - kfree(nvmeq); - return NULL; + return -ENOMEM; } static int queue_request_irq(struct nvme_queue *nvmeq) @@ -1592,14 +1582,12 @@ static int nvme_pci_configure_admin_queue(struct nvme_dev *dev) if (result < 0) return result; - nvmeq = dev->queues[0]; - if (!nvmeq) { - nvmeq = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, - dev_to_node(dev->dev)); - if (!nvmeq) - return -ENOMEM; - } + result = nvme_alloc_queue(dev, 0, NVME_AQ_DEPTH, + dev_to_node(dev->dev)); + if (result) + return result; + nvmeq = &dev->queues[0]; aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; @@ -1629,7 +1617,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) for (i = dev->ctrl.queue_count; i <= dev->max_qid; i++) { /* vector == qid - 1, match nvme_create_queue */ - if (!nvme_alloc_queue(dev, i, dev->q_depth, + if (nvme_alloc_queue(dev, i, dev->q_depth, pci_irq_get_node(to_pci_dev(dev->dev), i - 1))) { ret = -ENOMEM; break; @@ -1638,7 +1626,7 @@ static int nvme_create_io_queues(struct nvme_dev *dev) max = min(dev->max_qid, dev->ctrl.queue_count - 1); for (i = dev->online_queues; i <= max; i++) { - ret = nvme_create_queue(dev->queues[i], i); + ret = nvme_create_queue(&dev->queues[i], i); if (ret) break; } @@ -1894,7 +1882,7 @@ static int nvme_setup_host_mem(struct nvme_dev *dev) static int nvme_setup_io_queues(struct nvme_dev *dev) { - struct nvme_queue *adminq = dev->queues[0]; + struct nvme_queue *adminq = &dev->queues[0]; struct pci_dev *pdev = to_pci_dev(dev->dev); int result, nr_io_queues; unsigned long size; @@ -2020,7 +2008,7 @@ static void nvme_disable_io_queues(struct nvme_dev *dev, int queues) retry: timeout = ADMIN_TIMEOUT; for (; i > 0; i--, sent++) - if (nvme_delete_queue(dev->queues[i], opcode)) + if (nvme_delete_queue(&dev->queues[i], opcode)) break; while (sent--) { @@ -2212,7 +2200,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) queues = dev->online_queues - 1; for (i = dev->ctrl.queue_count - 1; i > 0; i--) - nvme_suspend_queue(dev->queues[i]); + nvme_suspend_queue(&dev->queues[i]); if (dead) { /* A device might become IO incapable very soon during @@ -2220,7 +2208,7 @@ static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) * queue_count can be 0 here. */ if (dev->ctrl.queue_count) - nvme_suspend_queue(dev->queues[0]); + nvme_suspend_queue(&dev->queues[0]); } else { nvme_disable_io_queues(dev, queues); nvme_disable_admin_queue(dev, shutdown); @@ -2482,8 +2470,9 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) dev = kzalloc_node(sizeof(*dev), GFP_KERNEL, node); if (!dev) return -ENOMEM; - dev->queues = kzalloc_node((num_possible_cpus() + 1) * sizeof(void *), - GFP_KERNEL, node); + + dev->queues = kcalloc_node(num_possible_cpus() + 1, + sizeof(struct nvme_queue), GFP_KERNEL, node); if (!dev->queues) goto free; -- cgit v1.2.3 From b227c59b9b5b8ae52639c8980af853d2f654f90a Mon Sep 17 00:00:00 2001 From: Roy Shterman Date: Sun, 14 Jan 2018 12:39:02 +0200 Subject: nvme: host delete_work and reset_work on separate workqueues We need to ensure that delete_work will be hosted on a different workqueue than all the works we flush or cancel from it. Otherwise we may hit a circular dependency warning [1]. Also, given that delete_work flushes reset_work, host reset_work on nvme_reset_wq and delete_work on nvme_delete_wq. In addition, fix the flushing in the individual drivers to flush nvme_delete_wq when draining queued deletes. [1]: [ 178.491942] ============================================= [ 178.492718] [ INFO: possible recursive locking detected ] [ 178.493495] 4.9.0-rc4-c844263313a8-lb #3 Tainted: G OE [ 178.494382] --------------------------------------------- [ 178.495160] kworker/5:1/135 is trying to acquire lock: [ 178.495894] ( [ 178.496120] "nvme-wq" [ 178.496471] ){++++.+} [ 178.496599] , at: [ 178.496921] [] flush_work+0x1a6/0x2d0 [ 178.497670] but task is already holding lock: [ 178.498499] ( [ 178.498724] "nvme-wq" [ 178.499074] ){++++.+} [ 178.499202] , at: [ 178.499520] [] process_one_work+0x162/0x6a0 [ 178.500343] other info that might help us debug this: [ 178.501269] Possible unsafe locking scenario: [ 178.502113] CPU0 [ 178.502472] ---- [ 178.502829] lock( [ 178.503115] "nvme-wq" [ 178.503467] ); [ 178.503716] lock( [ 178.504001] "nvme-wq" [ 178.504353] ); [ 178.504601] *** DEADLOCK *** [ 178.505441] May be due to missing lock nesting notation [ 178.506453] 2 locks held by kworker/5:1/135: [ 178.507068] #0: [ 178.507330] ( [ 178.507598] "nvme-wq" [ 178.507726] ){++++.+} [ 178.508079] , at: [ 178.508173] [] process_one_work+0x162/0x6a0 [ 178.509004] #1: [ 178.509265] ( [ 178.509532] (&ctrl->delete_work) [ 178.509795] ){+.+.+.} [ 178.510145] , at: [ 178.510239] [] process_one_work+0x162/0x6a0 [ 178.511070] stack backtrace: : [ 178.511693] CPU: 5 PID: 135 Comm: kworker/5:1 Tainted: G OE 4.9.0-rc4-c844263313a8-lb #3 [ 178.512974] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.1-1ubuntu1 04/01/2014 [ 178.514247] Workqueue: nvme-wq nvme_del_ctrl_work [nvme_tcp] [ 178.515071] ffffc2668175bae0 ffffffffa7450823 ffffffffa88abd80 ffffffffa88abd80 [ 178.516195] ffffc2668175bb98 ffffffffa70eb012 ffffffffa8d8d90d ffff9c472e9ea700 [ 178.517318] ffff9c472e9ea700 ffff9c4700000000 ffff9c4700007200 ab83be61bec0d50e [ 178.518443] Call Trace: [ 178.518807] [] dump_stack+0x85/0xc2 [ 178.519542] [] __lock_acquire+0x17d2/0x18f0 [ 178.520377] [] ? serial8250_console_putchar+0x27/0x30 [ 178.521330] [] ? wait_for_xmitr+0xa0/0xa0 [ 178.522174] [] ? flush_work+0x18b/0x2d0 [ 178.522975] [] lock_acquire+0x11b/0x220 [ 178.523753] [] ? flush_work+0x1a6/0x2d0 [ 178.524535] [] flush_work+0x1c9/0x2d0 [ 178.525291] [] ? flush_work+0x1a6/0x2d0 [ 178.526077] [] ? flush_workqueue_prep_pwqs+0x220/0x220 [ 178.527040] [] __cancel_work_timer+0x10f/0x1d0 [ 178.527907] [] ? vprintk_default+0x29/0x40 [ 178.528726] [] ? printk+0x48/0x50 [ 178.529434] [] cancel_delayed_work_sync+0x13/0x20 [ 178.530381] [] nvme_stop_ctrl+0x5b/0x70 [nvme_core] [ 178.531314] [] nvme_del_ctrl_work+0x2c/0x50 [nvme_tcp] [ 178.532271] [] process_one_work+0x1e1/0x6a0 [ 178.533101] [] ? process_one_work+0x162/0x6a0 [ 178.533954] [] worker_thread+0x4e/0x490 [ 178.534735] [] ? process_one_work+0x6a0/0x6a0 [ 178.535588] [] ? process_one_work+0x6a0/0x6a0 [ 178.536441] [] kthread+0xff/0x120 [ 178.537149] [] ? kthread_park+0x60/0x60 [ 178.538094] [] ? kthread_park+0x60/0x60 [ 178.538900] [] ret_from_fork+0x2a/0x40 Signed-off-by: Roy Shterman Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 44 +++++++++++++++++++++++++++++++++++++++----- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/rdma.c | 2 +- drivers/nvme/target/loop.c | 2 +- 4 files changed, 43 insertions(+), 7 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 4d8f63b3c5b6..fde6fd2e7eef 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -65,9 +65,26 @@ static bool streams; module_param(streams, bool, 0644); MODULE_PARM_DESC(streams, "turn on support for Streams write directives"); +/* + * nvme_wq - hosts nvme related works that are not reset or delete + * nvme_reset_wq - hosts nvme reset works + * nvme_delete_wq - hosts nvme delete works + * + * nvme_wq will host works such are scan, aen handling, fw activation, + * keep-alive error recovery, periodic reconnects etc. nvme_reset_wq + * runs reset works which also flush works hosted on nvme_wq for + * serialization purposes. nvme_delete_wq host controller deletion + * works which flush reset works for serialization. + */ struct workqueue_struct *nvme_wq; EXPORT_SYMBOL_GPL(nvme_wq); +struct workqueue_struct *nvme_reset_wq; +EXPORT_SYMBOL_GPL(nvme_reset_wq); + +struct workqueue_struct *nvme_delete_wq; +EXPORT_SYMBOL_GPL(nvme_delete_wq); + static DEFINE_IDA(nvme_subsystems_ida); static LIST_HEAD(nvme_subsystems); static DEFINE_MUTEX(nvme_subsystems_lock); @@ -89,7 +106,7 @@ int nvme_reset_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_RESETTING)) return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->reset_work)) + if (!queue_work(nvme_reset_wq, &ctrl->reset_work)) return -EBUSY; return 0; } @@ -123,7 +140,7 @@ int nvme_delete_ctrl(struct nvme_ctrl *ctrl) { if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_DELETING)) return -EBUSY; - if (!queue_work(nvme_wq, &ctrl->delete_work)) + if (!queue_work(nvme_delete_wq, &ctrl->delete_work)) return -EBUSY; return 0; } @@ -3526,16 +3543,26 @@ EXPORT_SYMBOL_GPL(nvme_reinit_tagset); int __init nvme_core_init(void) { - int result; + int result = -ENOMEM; nvme_wq = alloc_workqueue("nvme-wq", WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); if (!nvme_wq) - return -ENOMEM; + goto out; + + nvme_reset_wq = alloc_workqueue("nvme-reset-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_reset_wq) + goto destroy_wq; + + nvme_delete_wq = alloc_workqueue("nvme-delete-wq", + WQ_UNBOUND | WQ_MEM_RECLAIM | WQ_SYSFS, 0); + if (!nvme_delete_wq) + goto destroy_reset_wq; result = alloc_chrdev_region(&nvme_chr_devt, 0, NVME_MINORS, "nvme"); if (result < 0) - goto destroy_wq; + goto destroy_delete_wq; nvme_class = class_create(THIS_MODULE, "nvme"); if (IS_ERR(nvme_class)) { @@ -3554,8 +3581,13 @@ destroy_class: class_destroy(nvme_class); unregister_chrdev: unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); +destroy_delete_wq: + destroy_workqueue(nvme_delete_wq); +destroy_reset_wq: + destroy_workqueue(nvme_reset_wq); destroy_wq: destroy_workqueue(nvme_wq); +out: return result; } @@ -3565,6 +3597,8 @@ void nvme_core_exit(void) class_destroy(nvme_subsys_class); class_destroy(nvme_class); unregister_chrdev_region(nvme_chr_devt, NVME_MINORS); + destroy_workqueue(nvme_delete_wq); + destroy_workqueue(nvme_reset_wq); destroy_workqueue(nvme_wq); } diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 77faf2049917..8e7fc1b041b7 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -32,6 +32,8 @@ extern unsigned int admin_timeout; #define NVME_KATO_GRACE 10 extern struct workqueue_struct *nvme_wq; +extern struct workqueue_struct *nvme_reset_wq; +extern struct workqueue_struct *nvme_delete_wq; enum { NVME_NS_LBA = 0, diff --git a/drivers/nvme/host/rdma.c b/drivers/nvme/host/rdma.c index 75d6956eb380..38e183461d9d 100644 --- a/drivers/nvme/host/rdma.c +++ b/drivers/nvme/host/rdma.c @@ -2029,7 +2029,7 @@ static void nvme_rdma_remove_one(struct ib_device *ib_device, void *client_data) } mutex_unlock(&nvme_rdma_ctrl_mutex); - flush_workqueue(nvme_wq); + flush_workqueue(nvme_delete_wq); } static struct ib_client nvme_rdma_ib_client = { diff --git a/drivers/nvme/target/loop.c b/drivers/nvme/target/loop.c index fdfcc961029f..7991ec3a17db 100644 --- a/drivers/nvme/target/loop.c +++ b/drivers/nvme/target/loop.c @@ -717,7 +717,7 @@ static void __exit nvme_loop_cleanup_module(void) nvme_delete_ctrl(&ctrl->ctrl); mutex_unlock(&nvme_loop_ctrl_mutex); - flush_workqueue(nvme_wq); + flush_workqueue(nvme_delete_wq); } module_init(nvme_loop_init_module); -- cgit v1.2.3 From 8adb8c147b2f6383a1676325c27e3dbc29d2fba7 Mon Sep 17 00:00:00 2001 From: Minwoo Im Date: Sun, 14 Jan 2018 16:14:27 +0900 Subject: nvme: fix comment typos in nvme_create_io_queues fix comment typos in nvme_create_io_queues() like below. _aount_ to _amount_ _an_ to _can_ Signed-off-by: Minwoo Im Reviewed-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/host/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index b058b1e9b5bb..13057aee84e6 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1633,8 +1633,8 @@ static int nvme_create_io_queues(struct nvme_dev *dev) /* * Ignore failing Create SQ/CQ commands, we can continue with less - * than the desired aount of queues, and even a controller without - * I/O queues an still be used to issue admin commands. This might + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might * be useful to upgrade a buggy firmware for example. */ return ret >= 0 ? 0 : ret; -- cgit v1.2.3 From df351ef73789345b4b6c00434c5fd1fca7175643 Mon Sep 17 00:00:00 2001 From: Roland Dreier Date: Thu, 11 Jan 2018 13:38:00 -0800 Subject: nvme-fabrics: fix memory leak when parsing host ID option We use match_strdup() to get a copy of the option string for host ID string, but we just pass it to uuid_parse() and don't store the string pointer, so we need to kfree() the string after parsing it. Signed-off-by: Roland Dreier Reviewed-by: Sagi Grimberg Reviewed-by: Johannes Thumshirn Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.c b/drivers/nvme/host/fabrics.c index 2f68befd31bf..eb46967bb0d5 100644 --- a/drivers/nvme/host/fabrics.c +++ b/drivers/nvme/host/fabrics.c @@ -738,7 +738,9 @@ static int nvmf_parse_options(struct nvmf_ctrl_options *opts, ret = -ENOMEM; goto out; } - if (uuid_parse(p, &hostid)) { + ret = uuid_parse(p, &hostid); + kfree(p); + if (ret) { pr_err("Invalid hostid %s\n", p); ret = -EINVAL; goto out; -- cgit v1.2.3 From 423b4487fb23cc9bcbf14f748915bff46151506a Mon Sep 17 00:00:00 2001 From: Sagi Grimberg Date: Sun, 14 Jan 2018 18:34:22 +0200 Subject: nvmet: release a ns reference in nvmet_req_uninit if needed nvmet_req_init looked up a namespace and took a reference on it (unless it failed prior to that). If the request is uninitialized (in error cases) we need to remove that reference in case it was taken, otherwise we leak namespace reference when calling nvme_req_uninit. Signed-off-by: Sagi Grimberg Signed-off-by: Christoph Hellwig --- drivers/nvme/target/core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/target/core.c b/drivers/nvme/target/core.c index 7282ea8d3b96..0bd737117a80 100644 --- a/drivers/nvme/target/core.c +++ b/drivers/nvme/target/core.c @@ -512,6 +512,7 @@ bool nvmet_req_init(struct nvmet_req *req, struct nvmet_cq *cq, req->sg_cnt = 0; req->transfer_len = 0; req->rsp->status = 0; + req->ns = NULL; /* no support for fused commands yet */ if (unlikely(flags & (NVME_CMD_FUSE_FIRST | NVME_CMD_FUSE_SECOND))) { @@ -557,6 +558,8 @@ EXPORT_SYMBOL_GPL(nvmet_req_init); void nvmet_req_uninit(struct nvmet_req *req) { percpu_ref_put(&req->sq->ref); + if (req->ns) + nvmet_put_namespace(req->ns); } EXPORT_SYMBOL_GPL(nvmet_req_uninit); -- cgit v1.2.3 From d625d05ef0f0914a706d824fab85472a42be6659 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 11 Jan 2018 14:29:22 -0800 Subject: nvme-fc: fix rogue admin cmds stalling teardown When connectivity is lost to a device, the association is terminated and the blk-mq queues are quiesced/stopped. When connectivity is re-established, they are resumed. If an admin command is received while connectivity is list, the ioctl queues the command on the admin_q and the command stalls (the thread issuing the ioctl hangs/waits). if the connectivity is lost long enough such that the controller is then deleted, the delete code makes its calls to initiate the delete, which then expects the core layer to call the transport when all references are removed and the controller can be freed. Unfortunately, nothing in this path dequeued the admin command, so a reference sits outstanding and things stop, hanging the delete indefinitely. Correct by unquiescing the admin queue in the delete association. This means any admin command (which should only be from an ioctl) issued after connectivity is lost will detect the controller is in a reconnecting state and will (fast) fail the command. Thus, a pending reference can no longer be created. Once connectivity is re-established, a new ioctl/admin command would see proper device state and function again. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index 2a7a9a75105d..a10c77139f76 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2921,6 +2921,9 @@ nvme_fc_delete_association(struct nvme_fc_ctrl *ctrl) __nvme_fc_delete_hw_queue(ctrl, &ctrl->queues[0], 0); nvme_fc_free_queue(&ctrl->queues[0]); + /* re-enable the admin_q so anything new can fast fail */ + blk_mq_unquiesce_queue(ctrl->ctrl.admin_q); + nvme_fc_ctlr_inactive_on_rport(ctrl); } -- cgit v1.2.3 From 0fd997d3f77296522e836f7002e8a0636c9886aa Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 11 Jan 2018 15:21:38 -0800 Subject: nvme-fc: correct hang in nvme_ns_remove() When connectivity is lost to a device, the association is terminated and the blk-mq queues are quiesced/stopped. When connectivity is re-established, they are resumed. If connectivity is lost for a sufficient amount of time that the controller is then deleted, the delete path starts tearing down queues, and eventually calling nvme_ns_remove(). It appears that pending commands may cause blk_cleanup_queue() to never complete and the teardown stalls. Correct by starting the ns queues after transitioning to a DELETING state, allowing pending commands to be flushed with io failures. Thus the delete path is clear when reached. Signed-off-by: James Smart Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/nvme/host/fc.c b/drivers/nvme/host/fc.c index a10c77139f76..b76ba4629e02 100644 --- a/drivers/nvme/host/fc.c +++ b/drivers/nvme/host/fc.c @@ -2938,6 +2938,9 @@ nvme_fc_delete_ctrl(struct nvme_ctrl *nctrl) * waiting for io to terminate */ nvme_fc_delete_association(ctrl); + + /* resume the io queues so that things will fast fail */ + nvme_start_queues(nctrl); } static void -- cgit v1.2.3 From f65efd6dfe4e687637704f7023157fdee99913ca Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2017 14:25:11 +0100 Subject: nvme-pci: clean up CMB initialization Refactor the call to nvme_map_cmb, and change the conditions for probing for the CMB. First remove the version check as NVMe TPs always apply to earlier versions of the spec as well. Second check for the whole CMBSZ register for support of the CMB feature instead of just the size field inside of it to simplify the code a bit. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Logan Gunthorpe --- drivers/nvme/host/pci.c | 41 ++++++++++++++--------------------------- 1 file changed, 14 insertions(+), 27 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 13057aee84e6..edb57e984865 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1651,21 +1651,20 @@ static ssize_t nvme_cmb_show(struct device *dev, } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); -static void __iomem *nvme_map_cmb(struct nvme_dev *dev) +static void nvme_map_cmb(struct nvme_dev *dev) { u64 szu, size, offset; resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); - void __iomem *cmb; int bar; dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); - if (!(NVME_CMB_SZ(dev->cmbsz))) - return NULL; + if (!dev->cmbsz) + return; dev->cmbloc = readl(dev->bar + NVME_REG_CMBLOC); if (!use_cmb_sqes) - return NULL; + return; szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); @@ -1674,7 +1673,7 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) bar_size = pci_resource_len(pdev, bar); if (offset > bar_size) - return NULL; + return; /* * Controllers may support a CMB size larger than their BAR, @@ -1684,13 +1683,16 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (size > bar_size - offset) size = bar_size - offset; - cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); - if (!cmb) - return NULL; - + dev->cmb = ioremap_wc(pci_resource_start(pdev, bar) + offset, size); + if (!dev->cmb) + return; dev->cmb_bus_addr = pci_bus_address(pdev, bar) + offset; dev->cmb_size = size; - return cmb; + + if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, + &dev_attr_cmb.attr, NULL)) + dev_warn(dev->ctrl.device, + "failed to add sysfs attribute for CMB\n"); } static inline void nvme_release_cmb(struct nvme_dev *dev) @@ -2115,22 +2117,7 @@ static int nvme_pci_enable(struct nvme_dev *dev) "set queue depth=%u\n", dev->q_depth); } - /* - * CMBs can currently only exist on >=1.2 PCIe devices. We only - * populate sysfs if a CMB is implemented. Since nvme_dev_attrs_group - * has no name we can pass NULL as final argument to - * sysfs_add_file_to_group. - */ - - if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2, 0)) { - dev->cmb = nvme_map_cmb(dev); - if (dev->cmb) { - if (sysfs_add_file_to_group(&dev->ctrl.device->kobj, - &dev_attr_cmb.attr, NULL)) - dev_warn(dev->ctrl.device, - "failed to add sysfs attribute for CMB\n"); - } - } + nvme_map_cmb(dev); pci_enable_pcie_error_reporting(pdev); pci_save_state(pdev); -- cgit v1.2.3 From 88de4598bca84e27b261685c06fff816b8d932a1 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Wed, 20 Dec 2017 14:50:00 +0100 Subject: nvme-pci: clean up SMBSZ bit definitions Define the bit positions instead of macros using the magic values, and move the expanded helpers to calculate the size and size unit into the implementation C file. Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Reviewed-by: Logan Gunthorpe --- drivers/nvme/host/pci.c | 23 +++++++++++++++++------ include/linux/nvme.h | 22 ++++++++++++++-------- 2 files changed, 31 insertions(+), 14 deletions(-) diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index edb57e984865..a2ffb557b616 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -1364,7 +1364,7 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { - if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { + if (qid && dev->cmb && use_cmb_sqes && (dev->cmbsz & NVME_CMBSZ_SQS)) { unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_bus_addr + offset; @@ -1651,9 +1651,21 @@ static ssize_t nvme_cmb_show(struct device *dev, } static DEVICE_ATTR(cmb, S_IRUGO, nvme_cmb_show, NULL); +static u64 nvme_cmb_size_unit(struct nvme_dev *dev) +{ + u8 szu = (dev->cmbsz >> NVME_CMBSZ_SZU_SHIFT) & NVME_CMBSZ_SZU_MASK; + + return 1ULL << (12 + 4 * szu); +} + +static u32 nvme_cmb_size(struct nvme_dev *dev) +{ + return (dev->cmbsz >> NVME_CMBSZ_SZ_SHIFT) & NVME_CMBSZ_SZ_MASK; +} + static void nvme_map_cmb(struct nvme_dev *dev) { - u64 szu, size, offset; + u64 size, offset; resource_size_t bar_size; struct pci_dev *pdev = to_pci_dev(dev->dev); int bar; @@ -1666,9 +1678,8 @@ static void nvme_map_cmb(struct nvme_dev *dev) if (!use_cmb_sqes) return; - szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); - size = szu * NVME_CMB_SZ(dev->cmbsz); - offset = szu * NVME_CMB_OFST(dev->cmbloc); + size = nvme_cmb_size_unit(dev) * nvme_cmb_size(dev); + offset = nvme_cmb_size_unit(dev) * NVME_CMB_OFST(dev->cmbloc); bar = NVME_CMB_BIR(dev->cmbloc); bar_size = pci_resource_len(pdev, bar); @@ -1897,7 +1908,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) if (nr_io_queues == 0) return 0; - if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { + if (dev->cmb && (dev->cmbsz & NVME_CMBSZ_SQS)) { result = nvme_cmb_qdepth(dev, nr_io_queues, sizeof(struct nvme_command)); if (result > 0) diff --git a/include/linux/nvme.h b/include/linux/nvme.h index aea87f0d917b..4112e2bd747f 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -124,14 +124,20 @@ enum { #define NVME_CMB_BIR(cmbloc) ((cmbloc) & 0x7) #define NVME_CMB_OFST(cmbloc) (((cmbloc) >> 12) & 0xfffff) -#define NVME_CMB_SZ(cmbsz) (((cmbsz) >> 12) & 0xfffff) -#define NVME_CMB_SZU(cmbsz) (((cmbsz) >> 8) & 0xf) - -#define NVME_CMB_WDS(cmbsz) ((cmbsz) & 0x10) -#define NVME_CMB_RDS(cmbsz) ((cmbsz) & 0x8) -#define NVME_CMB_LISTS(cmbsz) ((cmbsz) & 0x4) -#define NVME_CMB_CQS(cmbsz) ((cmbsz) & 0x2) -#define NVME_CMB_SQS(cmbsz) ((cmbsz) & 0x1) + +enum { + NVME_CMBSZ_SQS = 1 << 0, + NVME_CMBSZ_CQS = 1 << 1, + NVME_CMBSZ_LISTS = 1 << 2, + NVME_CMBSZ_RDS = 1 << 3, + NVME_CMBSZ_WDS = 1 << 4, + + NVME_CMBSZ_SZ_SHIFT = 12, + NVME_CMBSZ_SZ_MASK = 0xfffff, + + NVME_CMBSZ_SZU_SHIFT = 8, + NVME_CMBSZ_SZU_MASK = 0xf, +}; /* * Submission and Completion Queue Entry Sizes for the NVM command set. -- cgit v1.2.3