From 9ec041ea40dbc6425c0ee6ae15786e5ab1d6aad6 Mon Sep 17 00:00:00 2001 From: Luca Boccassi Date: Fri, 10 Feb 2023 01:06:12 +0000 Subject: sed-opal: add support flag for SUM in status ioctl Not every OPAL drive supports SUM (Single User Mode), so report this information to userspace via the get-status ioctl so that we can adjust the formatting options accordingly. Tested on a kingston drive (which supports it) and a samsung one (which does not). Signed-off-by: Luca Boccassi Link: https://lore.kernel.org/r/20230210010612.28729-1-luca.boccassi@gmail.com Signed-off-by: Jens Axboe --- block/sed-opal.c | 2 ++ include/uapi/linux/sed-opal.h | 1 + 2 files changed, 3 insertions(+) diff --git a/block/sed-opal.c b/block/sed-opal.c index 463873f61e01..c320093c14f1 100644 --- a/block/sed-opal.c +++ b/block/sed-opal.c @@ -487,6 +487,8 @@ static int opal_discovery0_end(struct opal_dev *dev) break; case FC_SINGLEUSER: single_user = check_sum(body->features); + if (single_user) + dev->flags |= OPAL_FL_SUM_SUPPORTED; break; case FC_GEOMETRY: check_geometry(dev, body); diff --git a/include/uapi/linux/sed-opal.h b/include/uapi/linux/sed-opal.h index 1fed3c9294fc..d7a1524023db 100644 --- a/include/uapi/linux/sed-opal.h +++ b/include/uapi/linux/sed-opal.h @@ -144,6 +144,7 @@ struct opal_read_write_table { #define OPAL_FL_LOCKED 0x00000008 #define OPAL_FL_MBR_ENABLED 0x00000010 #define OPAL_FL_MBR_DONE 0x00000020 +#define OPAL_FL_SUM_SUPPORTED 0x00000040 struct opal_status { __u32 flags; -- cgit v1.2.3 From 0f77b29ad14e34a89961f32edc87b92db623bb37 Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 17 Feb 2023 10:21:59 +0800 Subject: block: Revert "block: Do not reread partition table on exclusively open device" This reverts commit 36369f46e91785688a5f39d7a5590e3f07981316. This patch can't fix the problem in a corner case that device can be opened exclusively after the checking and before blkdev_get_by_dev(). We'll use a new solution to fix the problem in the next patch, and the new solution doesn't need to change apis. Signed-off-by: Yu Kuai Acked-by: Jan Kara Link: https://lore.kernel.org/r/20230217022200.3092987-2-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/blk.h | 2 +- block/genhd.c | 7 ++----- block/ioctl.c | 13 ++++++------- 3 files changed, 9 insertions(+), 13 deletions(-) diff --git a/block/blk.h b/block/blk.h index 4c3b3325219a..e835f21d48af 100644 --- a/block/blk.h +++ b/block/blk.h @@ -427,7 +427,7 @@ int bio_add_hw_page(struct request_queue *q, struct bio *bio, struct request_queue *blk_alloc_queue(int node_id); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner); +int disk_scan_partitions(struct gendisk *disk, fmode_t mode); int disk_alloc_events(struct gendisk *disk); void disk_add_events(struct gendisk *disk); diff --git a/block/genhd.c b/block/genhd.c index 093ef292e98f..820e27d88cbf 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -356,7 +356,7 @@ void disk_uevent(struct gendisk *disk, enum kobject_action action) } EXPORT_SYMBOL_GPL(disk_uevent); -int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner) +int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; @@ -366,9 +366,6 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode, void *owner) return -EINVAL; if (disk->open_partitions) return -EBUSY; - /* Someone else has bdev exclusively open? */ - if (disk->part0->bd_holder && disk->part0->bd_holder != owner) - return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); @@ -499,7 +496,7 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) - disk_scan_partitions(disk, FMODE_READ, NULL); + disk_scan_partitions(disk, FMODE_READ); /* * Announce the disk and partitions after all partitions are diff --git a/block/ioctl.c b/block/ioctl.c index 96617512982e..6dd49d877584 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -467,10 +467,10 @@ static int blkdev_bszset(struct block_device *bdev, fmode_t mode, * user space. Note the separate arg/argp parameters that are needed * to deal with the compat_ptr() conversion. */ -static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd, - unsigned long arg, void __user *argp) +static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg, + void __user *argp) { - struct block_device *bdev = I_BDEV(file->f_mapping->host); unsigned int max_sectors; switch (cmd) { @@ -528,8 +528,7 @@ static int blkdev_common_ioctl(struct file *file, fmode_t mode, unsigned cmd, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL, - file); + return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: @@ -607,7 +606,7 @@ long blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret != -ENOIOCTLCMD) return ret; @@ -676,7 +675,7 @@ long compat_blkdev_ioctl(struct file *file, unsigned cmd, unsigned long arg) break; } - ret = blkdev_common_ioctl(file, mode, cmd, arg, argp); + ret = blkdev_common_ioctl(bdev, mode, cmd, arg, argp); if (ret == -ENOIOCTLCMD && disk->fops->compat_ioctl) ret = disk->fops->compat_ioctl(bdev, mode, cmd, arg); -- cgit v1.2.3 From e5cfefa97bccf956ea0bb6464c1f6c84fd7a8d9f Mon Sep 17 00:00:00 2001 From: Yu Kuai Date: Fri, 17 Feb 2023 10:22:00 +0800 Subject: block: fix scan partition for exclusively open device again As explained in commit 36369f46e917 ("block: Do not reread partition table on exclusively open device"), reread partition on the device that is exclusively opened by someone else is problematic. This patch will make sure partition scan will only be proceed if current thread open the device exclusively, or the device is not opened exclusively, and in the later case, other scanners and exclusive openers will be blocked temporarily until partition scan is done. Fixes: 10c70d95c0f2 ("block: remove the bd_openers checks in blk_drop_partitions") Cc: Suggested-by: Jan Kara Signed-off-by: Yu Kuai Reviewed-by: Christoph Hellwig Link: https://lore.kernel.org/r/20230217022200.3092987-3-yukuai1@huaweicloud.com Signed-off-by: Jens Axboe --- block/genhd.c | 30 ++++++++++++++++++++++++++---- block/ioctl.c | 2 +- 2 files changed, 27 insertions(+), 5 deletions(-) diff --git a/block/genhd.c b/block/genhd.c index 820e27d88cbf..c0a73cd76bb1 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -359,6 +359,7 @@ EXPORT_SYMBOL_GPL(disk_uevent); int disk_scan_partitions(struct gendisk *disk, fmode_t mode) { struct block_device *bdev; + int ret = 0; if (disk->flags & (GENHD_FL_NO_PART | GENHD_FL_HIDDEN)) return -EINVAL; @@ -368,11 +369,27 @@ int disk_scan_partitions(struct gendisk *disk, fmode_t mode) return -EBUSY; set_bit(GD_NEED_PART_SCAN, &disk->state); - bdev = blkdev_get_by_dev(disk_devt(disk), mode, NULL); + /* + * If the device is opened exclusively by current thread already, it's + * safe to scan partitons, otherwise, use bd_prepare_to_claim() to + * synchronize with other exclusive openers and other partition + * scanners. + */ + if (!(mode & FMODE_EXCL)) { + ret = bd_prepare_to_claim(disk->part0, disk_scan_partitions); + if (ret) + return ret; + } + + bdev = blkdev_get_by_dev(disk_devt(disk), mode & ~FMODE_EXCL, NULL); if (IS_ERR(bdev)) - return PTR_ERR(bdev); - blkdev_put(bdev, mode); - return 0; + ret = PTR_ERR(bdev); + else + blkdev_put(bdev, mode); + + if (!(mode & FMODE_EXCL)) + bd_abort_claiming(disk->part0, disk_scan_partitions); + return ret; } /** @@ -494,6 +511,11 @@ int __must_check device_add_disk(struct device *parent, struct gendisk *disk, if (ret) goto out_unregister_bdi; + /* Make sure the first partition scan will be proceed */ + if (get_capacity(disk) && !(disk->flags & GENHD_FL_NO_PART) && + !test_bit(GD_SUPPRESS_PART_SCAN, &disk->state)) + set_bit(GD_NEED_PART_SCAN, &disk->state); + bdev_add(disk->part0, ddev->devt); if (get_capacity(disk)) disk_scan_partitions(disk, FMODE_READ); diff --git a/block/ioctl.c b/block/ioctl.c index 6dd49d877584..9c5f637ff153 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -528,7 +528,7 @@ static int blkdev_common_ioctl(struct block_device *bdev, fmode_t mode, return -EACCES; if (bdev_is_partition(bdev)) return -EINVAL; - return disk_scan_partitions(bdev->bd_disk, mode & ~FMODE_EXCL); + return disk_scan_partitions(bdev->bd_disk, mode); case BLKTRACESTART: case BLKTRACESTOP: case BLKTRACETEARDOWN: -- cgit v1.2.3 From d88cbbb39b4db057feb1552de31f22c02a21b36f Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Fri, 17 Feb 2023 10:29:10 +0100 Subject: blk-mq: Reorder fields in 'struct blk_mq_tag_set' Group some variables based on their sizes to reduce hole and avoid padding. On x86_64, this shrinks the size of 'struct blk_mq_tag_set' from 304 to 296 bytes. Signed-off-by: Christophe JAILLET Link: https://lore.kernel.org/r/6f249f9b02a3490283ef0278096556de41aa0cf0.1676626130.git.christophe.jaillet@wanadoo.fr Signed-off-by: Jens Axboe --- include/linux/blk-mq.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/linux/blk-mq.h b/include/linux/blk-mq.h index 779fba613bd0..dd5ce1137f04 100644 --- a/include/linux/blk-mq.h +++ b/include/linux/blk-mq.h @@ -473,6 +473,7 @@ enum hctx_type { /** * struct blk_mq_tag_set - tag set that can be shared between request queues + * @ops: Pointers to functions that implement block driver behavior. * @map: One or more ctx -> hctx mappings. One map exists for each * hardware queue type (enum hctx_type) that the driver wishes * to support. There are no restrictions on maps being of the @@ -480,7 +481,6 @@ enum hctx_type { * types. * @nr_maps: Number of elements in the @map array. A number in the range * [1, HCTX_MAX_TYPES]. - * @ops: Pointers to functions that implement block driver behavior. * @nr_hw_queues: Number of hardware queues supported by the block driver that * owns this data structure. * @queue_depth: Number of tags per hardware queue, reserved tags included. @@ -505,9 +505,9 @@ enum hctx_type { * (BLK_MQ_F_BLOCKING). */ struct blk_mq_tag_set { + const struct blk_mq_ops *ops; struct blk_mq_queue_map map[HCTX_MAX_TYPES]; unsigned int nr_maps; - const struct blk_mq_ops *ops; unsigned int nr_hw_queues; unsigned int queue_depth; unsigned int reserved_tags; -- cgit v1.2.3 From 9e0c7efa5ea231d85c0d41693a5115b3b971717c Mon Sep 17 00:00:00 2001 From: Juhyung Park Date: Fri, 3 Feb 2023 11:40:29 +0900 Subject: block: remove more NULL checks after bdev_get_queue() bdev_get_queue() never returns NULL. Several commits [1][2] have been made before to remove such superfluous checks, but some still remained. For places where bdev_get_queue() is called solely for NULL checks, it is removed entirely. [1] commit ec9fd2a13d74 ("blk-lib: don't check bdev_get_queue() NULL check") [2] commit fea127b36c93 ("block: remove superfluous check for request queue in bdev_is_zoned()") Signed-off-by: Juhyung Park Reviewed-by: Pankaj Raghav Link: https://lore.kernel.org/r/20230203024029.48260-1-qkrwngud825@gmail.com Signed-off-by: Jens Axboe --- block/blk-zoned.c | 10 ---------- include/linux/blkdev.h | 7 +------ kernel/trace/blktrace.c | 6 +----- 3 files changed, 2 insertions(+), 21 deletions(-) diff --git a/block/blk-zoned.c b/block/blk-zoned.c index 614b575be899..fce9082384d6 100644 --- a/block/blk-zoned.c +++ b/block/blk-zoned.c @@ -334,17 +334,12 @@ int blkdev_report_zones_ioctl(struct block_device *bdev, fmode_t mode, { void __user *argp = (void __user *)arg; struct zone_report_args args; - struct request_queue *q; struct blk_zone_report rep; int ret; if (!argp) return -EINVAL; - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - if (!bdev_is_zoned(bdev)) return -ENOTTY; @@ -391,7 +386,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, unsigned long arg) { void __user *argp = (void __user *)arg; - struct request_queue *q; struct blk_zone_range zrange; enum req_op op; int ret; @@ -399,10 +393,6 @@ int blkdev_zone_mgmt_ioctl(struct block_device *bdev, fmode_t mode, if (!argp) return -EINVAL; - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - if (!bdev_is_zoned(bdev)) return -ENOTTY; diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index b9637d63e6f0..89dd9b02b45b 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1276,12 +1276,7 @@ static inline bool bdev_nowait(struct block_device *bdev) static inline enum blk_zoned_model bdev_zoned_model(struct block_device *bdev) { - struct request_queue *q = bdev_get_queue(bdev); - - if (q) - return blk_queue_zoned_model(q); - - return BLK_ZONED_NONE; + return blk_queue_zoned_model(bdev_get_queue(bdev)); } static inline bool bdev_is_zoned(struct block_device *bdev) diff --git a/kernel/trace/blktrace.c b/kernel/trace/blktrace.c index 5743be559415..d5d94510afd3 100644 --- a/kernel/trace/blktrace.c +++ b/kernel/trace/blktrace.c @@ -729,14 +729,10 @@ EXPORT_SYMBOL_GPL(blk_trace_startstop); **/ int blk_trace_ioctl(struct block_device *bdev, unsigned cmd, char __user *arg) { - struct request_queue *q; + struct request_queue *q = bdev_get_queue(bdev); int ret, start = 0; char b[BDEVNAME_SIZE]; - q = bdev_get_queue(bdev); - if (!q) - return -ENXIO; - mutex_lock(&q->debugfs_mutex); switch (cmd) { -- cgit v1.2.3 From 9c7c4bc986932218fd0df9d2a100509772028fb1 Mon Sep 17 00:00:00 2001 From: Ming Lei Date: Mon, 20 Feb 2023 12:14:13 +0800 Subject: ublk: remove check IO_URING_F_SQE128 in ublk_ch_uring_cmd sizeof(struct ublksrv_io_cmd) is 16bytes, which can be held in 64byte SQE, so not necessary to check IO_URING_F_SQE128. With this change, we get chance to save half SQ ring memory. Fixed: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") Signed-off-by: Ming Lei Link: https://lore.kernel.org/r/20230220041413.1524335-1-ming.lei@redhat.com Signed-off-by: Jens Axboe --- drivers/block/ublk_drv.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/block/ublk_drv.c b/drivers/block/ublk_drv.c index f48d213fb65e..09d29fa53939 100644 --- a/drivers/block/ublk_drv.c +++ b/drivers/block/ublk_drv.c @@ -1271,9 +1271,6 @@ static int ublk_ch_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags) __func__, cmd->cmd_op, ub_cmd->q_id, tag, ub_cmd->result); - if (!(issue_flags & IO_URING_F_SQE128)) - goto out; - if (ub_cmd->q_id >= ub->dev_info.nr_hw_queues) goto out; -- cgit v1.2.3 From 9f6ad5d533d1c71e51bdd06a5712c4fbc8768dfa Mon Sep 17 00:00:00 2001 From: Zhong Jinghua Date: Tue, 21 Feb 2023 17:50:27 +0800 Subject: loop: loop_set_status_from_info() check before assignment In loop_set_status_from_info(), lo->lo_offset and lo->lo_sizelimit should be checked before reassignment, because if an overflow error occurs, the original correct value will be changed to the wrong value, and it will not be changed back. More, the original patch did not solve the problem, the value was set and ioctl returned an error, but the subsequent io used the value in the loop driver, which still caused an alarm: loop_handle_cmd do_req_filebacked loff_t pos = ((loff_t) blk_rq_pos(rq) << 9) + lo->lo_offset; lo_rw_aio cmd->iocb.ki_pos = pos Fixes: c490a0b5a4f3 ("loop: Check for overflow while configuring loop") Signed-off-by: Zhong Jinghua Reviewed-by: Chaitanya Kulkarni Link: https://lore.kernel.org/r/20230221095027.3656193-1-zhongjinghua@huaweicloud.com Signed-off-by: Jens Axboe --- drivers/block/loop.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/block/loop.c b/drivers/block/loop.c index 5f04235e4ff7..839373451c2b 100644 --- a/drivers/block/loop.c +++ b/drivers/block/loop.c @@ -977,13 +977,13 @@ loop_set_status_from_info(struct loop_device *lo, return -EINVAL; } + /* Avoid assigning overflow values */ + if (info->lo_offset > LLONG_MAX || info->lo_sizelimit > LLONG_MAX) + return -EOVERFLOW; + lo->lo_offset = info->lo_offset; lo->lo_sizelimit = info->lo_sizelimit; - /* loff_t vars have been assigned __u64 */ - if (lo->lo_offset < 0 || lo->lo_sizelimit < 0) - return -EOVERFLOW; - memcpy(lo->lo_file_name, info->lo_file_name, LO_NAME_SIZE); lo->lo_file_name[LO_NAME_SIZE-1] = 0; lo->lo_flags = info->lo_flags; -- cgit v1.2.3 From 11eb695feb636fa5211067189cad25ac073e7fe5 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Feb 2023 09:59:44 -0700 Subject: block: clear bio->bi_bdev when putting a bio back in the cache This isn't strictly needed in terms of correctness, but it does allow polling to know if the bio has been put already by a different task and hence avoid polling something that we don't need to. Cc: stable@vger.kernel.org Fixes: be4d234d7aeb ("bio: add allocation cache abstraction") Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/bio.c | 1 + 1 file changed, 1 insertion(+) diff --git a/block/bio.c b/block/bio.c index 2693f34afb7e..605c40025068 100644 --- a/block/bio.c +++ b/block/bio.c @@ -772,6 +772,7 @@ static inline void bio_put_percpu_cache(struct bio *bio) if ((bio->bi_opf & REQ_POLLED) && !WARN_ON_ONCE(in_interrupt())) { bio->bi_next = cache->free_list; + bio->bi_bdev = NULL; cache->free_list = bio; cache->nr++; } else { -- cgit v1.2.3 From 310726c33ad76cebdee312dbfafc12c1b44bf977 Mon Sep 17 00:00:00 2001 From: Jens Axboe Date: Fri, 24 Feb 2023 10:01:19 -0700 Subject: block: be a bit more careful in checking for NULL bdev while polling Wei reports a crash with an application using polled IO: PGD 14265e067 P4D 14265e067 PUD 47ec50067 PMD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 21915 Comm: iocore_0 Kdump: loaded Tainted: G S 5.12.0-0_fbk12_clang_7346_g1bb6f2e7058f #1 Hardware name: Wiwynn Delta Lake MP T8/Delta Lake-Class2, BIOS Y3DLM08 04/10/2022 RIP: 0010:bio_poll+0x25/0x200 Code: 0f 1f 44 00 00 0f 1f 44 00 00 55 41 57 41 56 41 55 41 54 53 48 83 ec 28 65 48 8b 04 25 28 00 00 00 48 89 44 24 20 48 8b 47 08 <48> 8b 80 70 02 00 00 4c 8b 70 50 8b 6f 34 31 db 83 fd ff 75 25 65 RSP: 0018:ffffc90005fafdf8 EFLAGS: 00010292 RAX: 0000000000000000 RBX: 0000000000000000 RCX: 74b43cd65dd66600 RDX: 0000000000000003 RSI: ffffc90005fafe78 RDI: ffff8884b614e140 RBP: ffff88849964df78 R08: 0000000000000000 R09: 0000000000000008 R10: 0000000000000000 R11: 0000000000000000 R12: ffff88849964df00 R13: ffffc90005fafe78 R14: ffff888137d3c378 R15: 0000000000000001 FS: 00007fd195000640(0000) GS:ffff88903f400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 0000000000000270 CR3: 0000000466121001 CR4: 00000000007706f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 PKRU: 55555554 Call Trace: iocb_bio_iopoll+0x1d/0x30 io_do_iopoll+0xac/0x250 __se_sys_io_uring_enter+0x3c5/0x5a0 ? __x64_sys_write+0x89/0xd0 do_syscall_64+0x2d/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xae RIP: 0033:0x94f225d Code: 24 cc 00 00 00 41 8b 84 24 d0 00 00 00 c1 e0 04 83 e0 10 41 09 c2 8b 33 8b 53 04 4c 8b 43 18 4c 63 4b 0c b8 aa 01 00 00 0f 05 <85> c0 0f 88 85 00 00 00 29 03 45 84 f6 0f 84 88 00 00 00 41 f6 c7 RSP: 002b:00007fd194ffcd88 EFLAGS: 00000202 ORIG_RAX: 00000000000001aa RAX: ffffffffffffffda RBX: 00007fd194ffcdc0 RCX: 00000000094f225d RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000000000000007 RBP: 00007fd194ffcdb0 R08: 0000000000000000 R09: 0000000000000008 R10: 0000000000000001 R11: 0000000000000202 R12: 00007fd269d68030 R13: 0000000000000000 R14: 0000000000000001 R15: 0000000000000000 which is due to bio->bi_bdev being NULL. This can happen if we have two tasks doing polled IO, and task B ends up completing IO from task A if they are sharing a poll queue. If task B completes the IO and puts the bio into our cache, then it can allocate that bio again before task A is done polling for it. As that would necessitate a preempt between the two tasks, it's enough to just be a bit more careful in checking for whether or not bio->bi_bdev is NULL. Reported-and-tested-by: Wei Zhang Cc: stable@vger.kernel.org Fixes: be4d234d7aeb ("bio: add allocation cache abstraction") Reviewed-by: Keith Busch Signed-off-by: Jens Axboe --- block/blk-core.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/block/blk-core.c b/block/blk-core.c index 5fb6856745b4..5d7e2ca75234 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -854,10 +854,16 @@ EXPORT_SYMBOL(submit_bio); */ int bio_poll(struct bio *bio, struct io_comp_batch *iob, unsigned int flags) { - struct request_queue *q = bdev_get_queue(bio->bi_bdev); blk_qc_t cookie = READ_ONCE(bio->bi_cookie); + struct block_device *bdev; + struct request_queue *q; int ret = 0; + bdev = READ_ONCE(bio->bi_bdev); + if (!bdev) + return 0; + + q = bdev_get_queue(bdev); if (cookie == BLK_QC_T_NONE || !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) return 0; @@ -926,7 +932,7 @@ int iocb_bio_iopoll(struct kiocb *kiocb, struct io_comp_batch *iob, */ rcu_read_lock(); bio = READ_ONCE(kiocb->private); - if (bio && bio->bi_bdev) + if (bio) ret = bio_poll(bio, iob, flags); rcu_read_unlock(); -- cgit v1.2.3 From c0c33b94cfc23d21f67c7569a8785b33c74d6e3d Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Fri, 24 Feb 2023 07:34:24 -0800 Subject: nvme: fix sparse warning on effects masking The log entries are stored in le32, so use appropriate byte swapping macros. Reported-by: kernel test robot Link: https://lore.kernel.org/oe-kbuild-all/202302242222.PevBhzvC-lkp@intel.com/ Signed-off-by: Keith Busch Reviewed-by: Chaitanya Kulkarni Signed-off-by: Christoph Hellwig --- drivers/nvme/host/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 25968b25d0ba..384138f8f04c 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -3104,7 +3104,7 @@ static void nvme_init_known_nvm_effects(struct nvme_ctrl *ctrl) * Rather than blindly freezing the IO queues for this effect that * doesn't even apply to IO, mask it off. */ - log->acs[nvme_admin_security_recv] &= ~NVME_CMD_EFFECTS_CSE_MASK; + log->acs[nvme_admin_security_recv] &= cpu_to_le32(~NVME_CMD_EFFECTS_CSE_MASK); log->iocs[nvme_cmd_write] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); log->iocs[nvme_cmd_write_zeroes] |= cpu_to_le32(NVME_CMD_EFFECTS_LBCC); -- cgit v1.2.3 From e33b93650fc5364f773985a3e961e24349330d97 Mon Sep 17 00:00:00 2001 From: Breno Leitao Date: Tue, 28 Feb 2023 03:16:54 -0800 Subject: blk-iocost: Pass gendisk to ioc_refresh_params Current kernel (d2980d8d826554fa6981d621e569a453787472f8) crashes when blk_iocost_init for `nvme1` disk. BUG: kernel NULL pointer dereference, address: 0000000000000050 #PF: supervisor read access in kernel mode #PF: error_code(0x0000) - not-present page blk_iocost_init (include/asm-generic/qspinlock.h:128 include/linux/spinlock.h:203 include/linux/spinlock_api_smp.h:158 include/linux/spinlock.h:400 block/blk-iocost.c:2884) ioc_qos_write (block/blk-iocost.c:3198) ? kretprobe_perf_func (kernel/trace/trace_kprobe.c:1566) ? kernfs_fop_write_iter (include/linux/slab.h:584 fs/kernfs/file.c:311) ? __kmem_cache_alloc_node (mm/slab.h:? mm/slub.c:3452 mm/slub.c:3491) ? _copy_from_iter (arch/x86/include/asm/uaccess_64.h:46 arch/x86/include/asm/uaccess_64.h:52 lib/iov_iter.c:183 lib/iov_iter.c:628) ? kretprobe_dispatcher (kernel/trace/trace_kprobe.c:1693) cgroup_file_write (kernel/cgroup/cgroup.c:4061) kernfs_fop_write_iter (fs/kernfs/file.c:334) vfs_write (include/linux/fs.h:1849 fs/read_write.c:491 fs/read_write.c:584) ksys_write (fs/read_write.c:637) do_syscall_64 (arch/x86/entry/common.c:50 arch/x86/entry/common.c:80) entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:120) This happens because ioc_refresh_params() is being called without a properly initialized ioc->rqos, which is happening later in the callee side. ioc_refresh_params() -> ioc_autop_idx() tries to access ioc->rqos.disk->queue but ioc->rqos.disk is NULL, causing the BUG above. Create function, called ioc_refresh_params_disk(), that is similar to ioc_refresh_params() but where the "struct gendisk" could be passed as an explicit argument. This function will be called when ioc->rqos.disk is not initialized. Fixes: ce57b558604e ("blk-rq-qos: make rq_qos_add and rq_qos_del more useful") Signed-off-by: Breno Leitao Acked-by: Tejun Heo Link: https://lore.kernel.org/r/20230228111654.1778120-1-leitao@debian.org Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe --- block/blk-iocost.c | 26 ++++++++++++++++++++------ 1 file changed, 20 insertions(+), 6 deletions(-) diff --git a/block/blk-iocost.c b/block/blk-iocost.c index ff534e9d92dc..4442c7a85112 100644 --- a/block/blk-iocost.c +++ b/block/blk-iocost.c @@ -800,7 +800,11 @@ static void ioc_refresh_period_us(struct ioc *ioc) ioc_refresh_margins(ioc); } -static int ioc_autop_idx(struct ioc *ioc) +/* + * ioc->rqos.disk isn't initialized when this function is called from + * the init path. + */ +static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk) { int idx = ioc->autop_idx; const struct ioc_params *p = &autop[idx]; @@ -808,11 +812,11 @@ static int ioc_autop_idx(struct ioc *ioc) u64 now_ns; /* rotational? */ - if (!blk_queue_nonrot(ioc->rqos.disk->queue)) + if (!blk_queue_nonrot(disk->queue)) return AUTOP_HDD; /* handle SATA SSDs w/ broken NCQ */ - if (blk_queue_depth(ioc->rqos.disk->queue) == 1) + if (blk_queue_depth(disk->queue) == 1) return AUTOP_SSD_QD1; /* use one of the normal ssd sets */ @@ -901,14 +905,19 @@ static void ioc_refresh_lcoefs(struct ioc *ioc) &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]); } -static bool ioc_refresh_params(struct ioc *ioc, bool force) +/* + * struct gendisk is required as an argument because ioc->rqos.disk + * is not properly initialized when called from the init path. + */ +static bool ioc_refresh_params_disk(struct ioc *ioc, bool force, + struct gendisk *disk) { const struct ioc_params *p; int idx; lockdep_assert_held(&ioc->lock); - idx = ioc_autop_idx(ioc); + idx = ioc_autop_idx(ioc, disk); p = &autop[idx]; if (idx == ioc->autop_idx && !force) @@ -939,6 +948,11 @@ static bool ioc_refresh_params(struct ioc *ioc, bool force) return true; } +static bool ioc_refresh_params(struct ioc *ioc, bool force) +{ + return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); +} + /* * When an iocg accumulates too much vtime or gets deactivated, we throw away * some vtime, which lowers the overall device utilization. As the exact amount @@ -2880,7 +2894,7 @@ static int blk_iocost_init(struct gendisk *disk) spin_lock_irq(&ioc->lock); ioc->autop_idx = AUTOP_INVALID; - ioc_refresh_params(ioc, true); + ioc_refresh_params_disk(ioc, true, disk); spin_unlock_irq(&ioc->lock); /* -- cgit v1.2.3 From 0dd6fff2aad4e35633fef1ea72838bec5b47559a Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 21 Feb 2023 14:02:25 -0800 Subject: nvme: bring back auto-removal of deleted namespaces during sequential scan Bring back the check of the Identify Namespace return value for the legacy NVMe 1.0-style sequential scanning. While NVMe 1.0 does not support namespace management, there are "modern" cloud solutions like Google Cloud Platform that claim the obsolete 1.0 compliance for no good reason while supporting proprietary sideband namespace management. Fixes: 1a893c2bfef4 ("nvme: refactor namespace probing") Reported-by: Nils Hanke Signed-off-by: Christoph Hellwig Reviewed-by: Keith Busch Reviewed-by: Sagi Grimberg Tested-by: Nils Hanke --- drivers/nvme/host/core.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c index 384138f8f04c..3345f866178e 100644 --- a/drivers/nvme/host/core.c +++ b/drivers/nvme/host/core.c @@ -38,6 +38,7 @@ struct nvme_ns_info { bool is_shared; bool is_readonly; bool is_ready; + bool is_removed; }; unsigned int admin_timeout = 60; @@ -1402,16 +1403,8 @@ static int nvme_identify_ns(struct nvme_ctrl *ctrl, unsigned nsid, error = nvme_submit_sync_cmd(ctrl->admin_q, &c, *id, sizeof(**id)); if (error) { dev_warn(ctrl->device, "Identify namespace failed (%d)\n", error); - goto out_free_id; + kfree(*id); } - - error = NVME_SC_INVALID_NS | NVME_SC_DNR; - if ((*id)->ncap == 0) /* namespace not allocated or attached */ - goto out_free_id; - return 0; - -out_free_id: - kfree(*id); return error; } @@ -1425,6 +1418,13 @@ static int nvme_ns_info_from_identify(struct nvme_ctrl *ctrl, ret = nvme_identify_ns(ctrl, info->nsid, &id); if (ret) return ret; + + if (id->ncap == 0) { + /* namespace not allocated or attached */ + info->is_removed = true; + return -ENODEV; + } + info->anagrpid = id->anagrpid; info->is_shared = id->nmic & NVME_NS_NMIC_SHARED; info->is_readonly = id->nsattr & NVME_NS_ATTR_RO; @@ -4429,6 +4429,7 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) { struct nvme_ns_info info = { .nsid = nsid }; struct nvme_ns *ns; + int ret; if (nvme_identify_ns_descs(ctrl, &info)) return; @@ -4445,19 +4446,19 @@ static void nvme_scan_ns(struct nvme_ctrl *ctrl, unsigned nsid) * set up a namespace. If not fall back to the legacy version. */ if ((ctrl->cap & NVME_CAP_CRMS_CRIMS) || - (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) { - if (nvme_ns_info_from_id_cs_indep(ctrl, &info)) - return; - } else { - if (nvme_ns_info_from_identify(ctrl, &info)) - return; - } + (info.ids.csi != NVME_CSI_NVM && info.ids.csi != NVME_CSI_ZNS)) + ret = nvme_ns_info_from_id_cs_indep(ctrl, &info); + else + ret = nvme_ns_info_from_identify(ctrl, &info); + + if (info.is_removed) + nvme_ns_remove_by_nsid(ctrl, nsid); /* * Ignore the namespace if it is not ready. We will get an AEN once it * becomes ready and restart the scan. */ - if (!info.is_ready) + if (ret || !info.is_ready) return; ns = nvme_find_get_ns(ctrl, nsid); -- cgit v1.2.3 From 51d24f701f453c18cb5f4596d8bbe8034e5d3fb4 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 16 Feb 2023 15:14:49 +0300 Subject: nvme-auth: fix an error code in nvme_auth_process_dhchap_challenge() This function was transitioned from returning NVMe status codes to returning traditional kernel error codes. However, this particular return now accidentally returns positive error codes like ENOMEM instead of negative -ENOMEM. Fixes: b0ef1b11d390 ("nvme-auth: don't use NVMe status codes") Signed-off-by: Dan Carpenter Signed-off-by: Christoph Hellwig --- drivers/nvme/host/auth.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/nvme/host/auth.c b/drivers/nvme/host/auth.c index 787537454f7f..3e96714b8bc0 100644 --- a/drivers/nvme/host/auth.c +++ b/drivers/nvme/host/auth.c @@ -254,7 +254,7 @@ select_kpp: chap->qid, ret, gid_name); chap->status = NVME_AUTH_DHCHAP_FAILURE_DHGROUP_UNUSABLE; chap->dh_tfm = NULL; - return -ret; + return ret; } dev_dbg(ctrl->device, "qid %d: selected DH group %s\n", chap->qid, gid_name); -- cgit v1.2.3 From 76d54bf20cdcc1ed7569a89885e09636e9a8d71d Mon Sep 17 00:00:00 2001 From: Akinobu Mita Date: Sun, 26 Feb 2023 21:42:54 +0900 Subject: nvme-tcp: don't access released socket during error recovery While the error recovery work is temporarily failing reconnect attempts, running the 'nvme list' command causes a kernel NULL pointer dereference by calling getsockname() with a released socket. During error recovery work, the nvme tcp socket is released and a new one created, so it is not safe to access the socket without proper check. Signed-off-by: Akinobu Mita Fixes: 02c57a82c008 ("nvme-tcp: print actual source IP address through sysfs "address" attr") Reviewed-by: Martin Belanger Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/tcp.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c index d6100a787d39..ef27122acce6 100644 --- a/drivers/nvme/host/tcp.c +++ b/drivers/nvme/host/tcp.c @@ -2489,6 +2489,10 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) len = nvmf_get_address(ctrl, buf, size); + mutex_lock(&queue->queue_lock); + + if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags)) + goto done; ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr); if (ret > 0) { if (len > 0) @@ -2496,6 +2500,8 @@ static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size) len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n", (len) ? "," : "", &src_addr); } +done: + mutex_unlock(&queue->queue_lock); return len; } -- cgit v1.2.3 From 26a57cb35548ae67c14871cccbf50da3edb01ea4 Mon Sep 17 00:00:00 2001 From: Daniel Wagner Date: Tue, 21 Feb 2023 17:51:06 +0100 Subject: nvme-fabrics: show well known discovery name The kernel always logs the unique subsystem name for a discovery controller, even in the case user space asked for the well known. This has lead to confusion as the logs of nvme-cli and the kernel logs didn't match. First, nvme-cli connects to the well known discovery controller to figure out if it supports TP8013. If so then nvme-cli disconnects and connects to the unique discovery controller. Currently, the kernel show that user space connected twice to the unique one. To avoid further confusion, show the well known discovery controller if user space asked for it: $ nvme connect-all -v -t tcp -a 192.168.0.1 nvme0: nqn.2014-08.org.nvmexpress.discovery connected nvme0: nqn.2014-08.org.nvmexpress.discovery disconnected nvme0: nqn.discovery connected kernel log: nvme nvme0: new ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery", addr 192.168.0.1:8009 nvme nvme0: Removing ctrl: NQN "nqn.2014-08.org.nvmexpress.discovery" nvme nvme0: new ctrl: NQN "nqn.discovery", addr 192.168.0.1:8009 Fixes: e5ea42faa773 ("nvme: display correct subsystem NQN") Signed-off-by: Daniel Wagner Reviewed-by: Hannes Reinecke Signed-off-by: Christoph Hellwig --- drivers/nvme/host/fabrics.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nvme/host/fabrics.h b/drivers/nvme/host/fabrics.h index a6e22116e139..dcac3df8a5f7 100644 --- a/drivers/nvme/host/fabrics.h +++ b/drivers/nvme/host/fabrics.h @@ -189,7 +189,8 @@ nvmf_ctlr_matches_baseopts(struct nvme_ctrl *ctrl, static inline char *nvmf_ctrl_subsysnqn(struct nvme_ctrl *ctrl) { - if (!ctrl->subsys) + if (!ctrl->subsys || + !strcmp(ctrl->opts->subsysnqn, NVME_DISC_SUBSYS_NAME)) return ctrl->opts->subsysnqn; return ctrl->subsys->subnqn; } -- cgit v1.2.3 From 49d24398327e32265eccdeec4baeb5a6a609c0bd Mon Sep 17 00:00:00 2001 From: Uday Shankar Date: Tue, 28 Feb 2023 17:06:55 -0700 Subject: blk-mq: enforce op-specific segment limits in blk_insert_cloned_request The block layer might merge together discard requests up until the max_discard_segments limit is hit, but blk_insert_cloned_request checks the segment count against max_segments regardless of the req op. This can result in errors like the following when discards are issued through a DM device and max_discard_segments exceeds max_segments for the queue of the chosen underlying device. blk_insert_cloned_request: over max segments limit. (256 > 129) Fix this by looking at the req_op and enforcing the appropriate segment limit - max_discard_segments for REQ_OP_DISCARDs and max_segments for everything else. Signed-off-by: Uday Shankar Reviewed-by: Keith Busch Reviewed-by: Ming Lei Link: https://lore.kernel.org/r/20230301000655.48112-1-ushankar@purestorage.com Signed-off-by: Jens Axboe --- block/blk-merge.c | 7 ------- block/blk-mq.c | 7 ++++--- block/blk.h | 7 +++++++ 3 files changed, 11 insertions(+), 10 deletions(-) diff --git a/block/blk-merge.c b/block/blk-merge.c index 808b58129d3e..ff72edd7ee03 100644 --- a/block/blk-merge.c +++ b/block/blk-merge.c @@ -586,13 +586,6 @@ int __blk_rq_map_sg(struct request_queue *q, struct request *rq, } EXPORT_SYMBOL(__blk_rq_map_sg); -static inline unsigned int blk_rq_get_max_segments(struct request *rq) -{ - if (req_op(rq) == REQ_OP_DISCARD) - return queue_max_discard_segments(rq->q); - return queue_max_segments(rq->q); -} - static inline unsigned int blk_rq_get_max_sectors(struct request *rq, sector_t offset) { diff --git a/block/blk-mq.c b/block/blk-mq.c index 08093d4348dd..7c6f812323af 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -3000,6 +3000,7 @@ blk_status_t blk_insert_cloned_request(struct request *rq) { struct request_queue *q = rq->q; unsigned int max_sectors = blk_queue_get_max_sectors(q, req_op(rq)); + unsigned int max_segments = blk_rq_get_max_segments(rq); blk_status_t ret; if (blk_rq_sectors(rq) > max_sectors) { @@ -3026,9 +3027,9 @@ blk_status_t blk_insert_cloned_request(struct request *rq) * original queue. */ rq->nr_phys_segments = blk_recalc_rq_segments(rq); - if (rq->nr_phys_segments > queue_max_segments(q)) { - printk(KERN_ERR "%s: over max segments limit. (%hu > %hu)\n", - __func__, rq->nr_phys_segments, queue_max_segments(q)); + if (rq->nr_phys_segments > max_segments) { + printk(KERN_ERR "%s: over max segments limit. (%u > %u)\n", + __func__, rq->nr_phys_segments, max_segments); return BLK_STS_IOERR; } diff --git a/block/blk.h b/block/blk.h index e835f21d48af..cc4e8873dfde 100644 --- a/block/blk.h +++ b/block/blk.h @@ -156,6 +156,13 @@ static inline bool blk_discard_mergable(struct request *req) return false; } +static inline unsigned int blk_rq_get_max_segments(struct request *rq) +{ + if (req_op(rq) == REQ_OP_DISCARD) + return queue_max_discard_segments(rq->q); + return queue_max_segments(rq->q); +} + static inline unsigned int blk_queue_get_max_sectors(struct request_queue *q, enum req_op op) { -- cgit v1.2.3