diff options
author | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-02-17 12:00:09 +1100 |
---|---|---|
committer | Stephen Rothwell <sfr@canb.auug.org.au> | 2017-02-17 12:00:09 +1100 |
commit | 698194ed41e5484c570b9ebab5af65c0c64e6817 (patch) | |
tree | 78069b5f6298d80419951481f94203e3822539b2 | |
parent | f66402c8c4d7224a96d430bd6919a46da75c1be9 (diff) | |
parent | eae8263fb1f4256460270dd8f42334604dcdfac6 (diff) |
Merge remote-tracking branch 'md/for-next'
-rw-r--r-- | Documentation/00-INDEX | 4 | ||||
-rw-r--r-- | Documentation/admin-guide/md.rst | 5 | ||||
-rw-r--r-- | Documentation/md/md-cluster.txt (renamed from Documentation/md-cluster.txt) | 0 | ||||
-rw-r--r-- | Documentation/md/raid5-cache.txt | 109 | ||||
-rw-r--r-- | block/bio.c | 61 | ||||
-rw-r--r-- | drivers/md/faulty.c | 2 | ||||
-rw-r--r-- | drivers/md/linear.c | 40 | ||||
-rw-r--r-- | drivers/md/linear.h | 1 | ||||
-rw-r--r-- | drivers/md/md.c | 22 | ||||
-rw-r--r-- | drivers/md/md.h | 9 | ||||
-rw-r--r-- | drivers/md/multipath.c | 1 | ||||
-rw-r--r-- | drivers/md/raid0.c | 1 | ||||
-rw-r--r-- | drivers/md/raid1.c | 28 | ||||
-rw-r--r-- | drivers/md/raid10.c | 11 | ||||
-rw-r--r-- | drivers/md/raid5-cache.c | 225 | ||||
-rw-r--r-- | drivers/md/raid5.c | 129 | ||||
-rw-r--r-- | drivers/md/raid5.h | 7 | ||||
-rw-r--r-- | include/linux/bio.h | 11 | ||||
-rw-r--r-- | lib/radix-tree.c | 1 |
19 files changed, 561 insertions, 106 deletions
diff --git a/Documentation/00-INDEX b/Documentation/00-INDEX index c8a8eb1a2b11..793acf999e9e 100644 --- a/Documentation/00-INDEX +++ b/Documentation/00-INDEX @@ -270,8 +270,8 @@ m68k/ - directory with info about Linux on Motorola 68k architecture. mailbox.txt - How to write drivers for the common mailbox framework (IPC). -md-cluster.txt - - info on shared-device RAID MD cluster. +md/ + - directory with info about Linux Software RAID media/ - info on media drivers: uAPI, kAPI and driver documentation. memory-barriers.txt diff --git a/Documentation/admin-guide/md.rst b/Documentation/admin-guide/md.rst index e449fb5f277c..1e61bf50595c 100644 --- a/Documentation/admin-guide/md.rst +++ b/Documentation/admin-guide/md.rst @@ -725,3 +725,8 @@ These currently include: to 1. Setting this to 0 disables bypass accounting and requires preread stripes to wait until all full-width stripe- writes are complete. Valid values are 0 to stripe_cache_size. + + journal_mode (currently raid5 only) + The cache mode for raid5. raid5 could include an extra disk for + caching. The mode can be "write-throuth" and "write-back". The + default is "write-through". diff --git a/Documentation/md-cluster.txt b/Documentation/md/md-cluster.txt index 38883276d31c..38883276d31c 100644 --- a/Documentation/md-cluster.txt +++ b/Documentation/md/md-cluster.txt diff --git a/Documentation/md/raid5-cache.txt b/Documentation/md/raid5-cache.txt new file mode 100644 index 000000000000..2b210f295786 --- /dev/null +++ b/Documentation/md/raid5-cache.txt @@ -0,0 +1,109 @@ +RAID5 cache + +Raid 4/5/6 could include an extra disk for data cache besides normal RAID +disks. The role of RAID disks isn't changed with the cache disk. The cache disk +caches data to the RAID disks. The cache can be in write-through (supported +since 4.4) or write-back mode (supported since 4.10). mdadm (supported since +3.4) has a new option '--write-journal' to create array with cache. Please +refer to mdadm manual for details. By default (RAID array starts), the cache is +in write-through mode. A user can switch it to write-back mode by: + +echo "write-back" > /sys/block/md0/md/journal_mode + +And switch it back to write-through mode by: + +echo "write-through" > /sys/block/md0/md/journal_mode + +In both modes, all writes to the array will hit cache disk first. This means +the cache disk must be fast and sustainable. + +------------------------------------- +write-through mode: + +This mode mainly fixes the 'write hole' issue. For RAID 4/5/6 array, an unclean +shutdown can cause data in some stripes to not be in consistent state, eg, data +and parity don't match. The reason is that a stripe write involves several RAID +disks and it's possible the writes don't hit all RAID disks yet before the +unclean shutdown. We call an array degraded if it has inconsistent data. MD +tries to resync the array to bring it back to normal state. But before the +resync completes, any system crash will expose the chance of real data +corruption in the RAID array. This problem is called 'write hole'. + +The write-through cache will cache all data on cache disk first. After the data +is safe on the cache disk, the data will be flushed onto RAID disks. The +two-step write will guarantee MD can recover correct data after unclean +shutdown even the array is degraded. Thus the cache can close the 'write hole'. + +In write-through mode, MD reports IO completion to upper layer (usually +filesystems) after the data is safe on RAID disks, so cache disk failure +doesn't cause data loss. Of course cache disk failure means the array is +exposed to 'write hole' again. + +In write-through mode, the cache disk isn't required to be big. Several +hundreds megabytes are enough. + +-------------------------------------- +write-back mode: + +write-back mode fixes the 'write hole' issue too, since all write data is +cached on cache disk. But the main goal of 'write-back' cache is to speed up +write. If a write crosses all RAID disks of a stripe, we call it full-stripe +write. For non-full-stripe writes, MD must read old data before the new parity +can be calculated. These synchronous reads hurt write throughput. Some writes +which are sequential but not dispatched in the same time will suffer from this +overhead too. Write-back cache will aggregate the data and flush the data to +RAID disks only after the data becomes a full stripe write. This will +completely avoid the overhead, so it's very helpful for some workloads. A +typical workload which does sequential write followed by fsync is an example. + +In write-back mode, MD reports IO completion to upper layer (usually +filesystems) right after the data hits cache disk. The data is flushed to raid +disks later after specific conditions met. So cache disk failure will cause +data loss. + +In write-back mode, MD also caches data in memory. The memory cache includes +the same data stored on cache disk, so a power loss doesn't cause data loss. +The memory cache size has performance impact for the array. It's recommended +the size is big. A user can configure the size by: + +echo "2048" > /sys/block/md0/md/stripe_cache_size + +Too small cache disk will make the write aggregation less efficient in this +mode depending on the workloads. It's recommended to use a cache disk with at +least several gigabytes size in write-back mode. + +-------------------------------------- +The implementation: + +The write-through and write-back cache use the same disk format. The cache disk +is organized as a simple write log. The log consists of 'meta data' and 'data' +pairs. The meta data describes the data. It also includes checksum and sequence +ID for recovery identification. Data can be IO data and parity data. Data is +checksumed too. The checksum is stored in the meta data ahead of the data. The +checksum is an optimization because MD can write meta and data freely without +worry about the order. MD superblock has a field pointed to the valid meta data +of log head. + +The log implementation is pretty straightforward. The difficult part is the +order in which MD writes data to cache disk and RAID disks. Specifically, in +write-through mode, MD calculates parity for IO data, writes both IO data and +parity to the log, writes the data and parity to RAID disks after the data and +parity is settled down in log and finally the IO is finished. Read just reads +from raid disks as usual. + +In write-back mode, MD writes IO data to the log and reports IO completion. The +data is also fully cached in memory at that time, which means read must query +memory cache. If some conditions are met, MD will flush the data to RAID disks. +MD will calculate parity for the data and write parity into the log. After this +is finished, MD will write both data and parity into RAID disks, then MD can +release the memory cache. The flush conditions could be stripe becomes a full +stripe write, free cache disk space is low or free in-kernel memory cache space +is low. + +After an unclean shutdown, MD does recovery. MD reads all meta data and data +from the log. The sequence ID and checksum will help us detect corrupted meta +data and data. If MD finds a stripe with data and valid parities (1 parity for +raid4/5 and 2 for raid6), MD will write the data and parities to RAID disks. If +parities are incompleted, they are discarded. If part of data is corrupted, +they are discarded too. MD then loads valid data and writes them to RAID disks +in normal way. diff --git a/block/bio.c b/block/bio.c index 4b564d0c3e29..5eec5e08417f 100644 --- a/block/bio.c +++ b/block/bio.c @@ -625,21 +625,20 @@ struct bio *bio_clone_fast(struct bio *bio, gfp_t gfp_mask, struct bio_set *bs) } EXPORT_SYMBOL(bio_clone_fast); -/** - * bio_clone_bioset - clone a bio - * @bio_src: bio to clone - * @gfp_mask: allocation priority - * @bs: bio_set to allocate from - * - * Clone bio. Caller will own the returned bio, but not the actual data it - * points to. Reference count of returned bio will be one. - */ -struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, - struct bio_set *bs) +static struct bio *__bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs, int offset, + int size) { struct bvec_iter iter; struct bio_vec bv; struct bio *bio; + struct bvec_iter iter_src = bio_src->bi_iter; + + /* for supporting partial clone */ + if (offset || size != bio_src->bi_iter.bi_size) { + bio_advance_iter(bio_src, &iter_src, offset); + iter_src.bi_size = size; + } /* * Pre immutable biovecs, __bio_clone() used to just do a memcpy from @@ -663,7 +662,8 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, * __bio_clone_fast() anyways. */ - bio = bio_alloc_bioset(gfp_mask, bio_segments(bio_src), bs); + bio = bio_alloc_bioset(gfp_mask, __bio_segments(bio_src, + &iter_src), bs); if (!bio) return NULL; bio->bi_bdev = bio_src->bi_bdev; @@ -680,7 +680,7 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, bio->bi_io_vec[bio->bi_vcnt++] = bio_src->bi_io_vec[0]; break; default: - bio_for_each_segment(bv, bio_src, iter) + __bio_for_each_segment(bv, bio_src, iter, iter_src) bio->bi_io_vec[bio->bi_vcnt++] = bv; break; } @@ -699,9 +699,44 @@ struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, return bio; } + +/** + * bio_clone_bioset - clone a bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs) +{ + return __bio_clone_bioset(bio_src, gfp_mask, bs, 0, + bio_src->bi_iter.bi_size); +} EXPORT_SYMBOL(bio_clone_bioset); /** + * bio_clone_bioset_partial - clone a partial bio + * @bio_src: bio to clone + * @gfp_mask: allocation priority + * @bs: bio_set to allocate from + * @offset: cloned starting from the offset + * @size: size for the cloned bio + * + * Clone bio. Caller will own the returned bio, but not the actual data it + * points to. Reference count of returned bio will be one. + */ +struct bio *bio_clone_bioset_partial(struct bio *bio_src, gfp_t gfp_mask, + struct bio_set *bs, int offset, + int size) +{ + return __bio_clone_bioset(bio_src, gfp_mask, bs, offset, size); +} +EXPORT_SYMBOL(bio_clone_bioset_partial); + +/** * bio_add_pc_page - attempt to add page to bio * @q: the target queue * @bio: destination bio diff --git a/drivers/md/faulty.c b/drivers/md/faulty.c index 685aa2d77e25..b0536cfd8e17 100644 --- a/drivers/md/faulty.c +++ b/drivers/md/faulty.c @@ -214,7 +214,7 @@ static void faulty_make_request(struct mddev *mddev, struct bio *bio) } } if (failit) { - struct bio *b = bio_clone_mddev(bio, GFP_NOIO, mddev); + struct bio *b = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); b->bi_bdev = conf->rdev->bdev; b->bi_private = bio; diff --git a/drivers/md/linear.c b/drivers/md/linear.c index f1c7bbac31a5..a6a50be86c27 100644 --- a/drivers/md/linear.c +++ b/drivers/md/linear.c @@ -53,18 +53,26 @@ static inline struct dev_info *which_dev(struct mddev *mddev, sector_t sector) return conf->disks + lo; } +/* + * In linear_congested() conf->raid_disks is used as a copy of + * mddev->raid_disks to iterate conf->disks[], because conf->raid_disks + * and conf->disks[] are created in linear_conf(), they are always + * consitent with each other, but mddev->raid_disks does not. + */ static int linear_congested(struct mddev *mddev, int bits) { struct linear_conf *conf; int i, ret = 0; - conf = mddev->private; + rcu_read_lock(); + conf = rcu_dereference(mddev->private); - for (i = 0; i < mddev->raid_disks && !ret ; i++) { + for (i = 0; i < conf->raid_disks && !ret ; i++) { struct request_queue *q = bdev_get_queue(conf->disks[i].rdev->bdev); ret |= bdi_congested(q->backing_dev_info, bits); } + rcu_read_unlock(); return ret; } @@ -144,6 +152,19 @@ static struct linear_conf *linear_conf(struct mddev *mddev, int raid_disks) conf->disks[i-1].end_sector + conf->disks[i].rdev->sectors; + /* + * conf->raid_disks is copy of mddev->raid_disks. The reason to + * keep a copy of mddev->raid_disks in struct linear_conf is, + * mddev->raid_disks may not be consistent with pointers number of + * conf->disks[] when it is updated in linear_add() and used to + * iterate old conf->disks[] earray in linear_congested(). + * Here conf->raid_disks is always consitent with number of + * pointers in conf->disks[] array, and mddev->private is updated + * with rcu_assign_pointer() in linear_addr(), such race can be + * avoided. + */ + conf->raid_disks = raid_disks; + return conf; out: @@ -196,15 +217,23 @@ static int linear_add(struct mddev *mddev, struct md_rdev *rdev) if (!newconf) return -ENOMEM; + /* newconf->raid_disks already keeps a copy of * the increased + * value of mddev->raid_disks, WARN_ONCE() is just used to make + * sure of this. It is possible that oldconf is still referenced + * in linear_congested(), therefore kfree_rcu() is used to free + * oldconf until no one uses it anymore. + */ mddev_suspend(mddev); - oldconf = mddev->private; + oldconf = rcu_dereference(mddev->private); mddev->raid_disks++; - mddev->private = newconf; + WARN_ONCE(mddev->raid_disks != newconf->raid_disks, + "copied raid_disks doesn't match mddev->raid_disks"); + rcu_assign_pointer(mddev->private, newconf); md_set_array_sectors(mddev, linear_size(mddev, 0, 0)); set_capacity(mddev->gendisk, mddev->array_sectors); mddev_resume(mddev); revalidate_disk(mddev->gendisk); - kfree(oldconf); + kfree_rcu(oldconf, rcu); return 0; } @@ -262,6 +291,7 @@ static void linear_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/linear.h b/drivers/md/linear.h index b685ddd7d7f7..8d392e6098b3 100644 --- a/drivers/md/linear.h +++ b/drivers/md/linear.h @@ -10,6 +10,7 @@ struct linear_conf { struct rcu_head rcu; sector_t array_sectors; + int raid_disks; /* a copy of mddev->raid_disks */ struct dev_info disks[0]; }; #endif diff --git a/drivers/md/md.c b/drivers/md/md.c index ba485dcf1064..985374f20e2e 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -190,16 +190,6 @@ struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, } EXPORT_SYMBOL_GPL(bio_alloc_mddev); -struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev) -{ - if (!mddev || !mddev->bio_set) - return bio_clone(bio, gfp_mask); - - return bio_clone_bioset(bio, gfp_mask, mddev->bio_set); -} -EXPORT_SYMBOL_GPL(bio_clone_mddev); - /* * We have a system wide 'event count' that is incremented * on any 'interesting' event, and readers of /proc/mdstat @@ -5228,8 +5218,11 @@ int md_run(struct mddev *mddev) sysfs_notify_dirent_safe(rdev->sysfs_state); } - if (mddev->bio_set == NULL) + if (mddev->bio_set == NULL) { mddev->bio_set = bioset_create(BIO_POOL_SIZE, 0); + if (!mddev->bio_set) + return -ENOMEM; + } spin_lock(&pers_lock); pers = find_pers(mddev->level, mddev->clevel); @@ -8980,7 +8973,14 @@ static __exit void md_exit(void) for_each_mddev(mddev, tmp) { export_array(mddev); + mddev->ctime = 0; mddev->hold_active = 0; + /* + * for_each_mddev() will call mddev_put() at the end of each + * iteration. As the mddev is now fully clear, this will + * schedule the mddev for destruction by a workqueue, and the + * destroy_workqueue() below will wait for that to complete. + */ } destroy_workqueue(md_misc_wq); destroy_workqueue(md_wq); diff --git a/drivers/md/md.h b/drivers/md/md.h index 2a514036a83d..b8859cbf84b6 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -673,8 +673,6 @@ extern void md_rdev_clear(struct md_rdev *rdev); extern void mddev_suspend(struct mddev *mddev); extern void mddev_resume(struct mddev *mddev); -extern struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask, - struct mddev *mddev); extern struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs, struct mddev *mddev); @@ -710,4 +708,11 @@ static inline void mddev_clear_unsupported_flags(struct mddev *mddev, { mddev->flags &= ~unsupported_flags; } + +static inline void mddev_check_writesame(struct mddev *mddev, struct bio *bio) +{ + if (bio_op(bio) == REQ_OP_WRITE_SAME && + !bdev_get_queue(bio->bi_bdev)->limits.max_write_same_sectors) + mddev->queue->limits.max_write_same_sectors = 0; +} #endif /* _MD_MD_H */ diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index d457afa672d5..79a12b59250b 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -138,6 +138,7 @@ static void multipath_make_request(struct mddev *mddev, struct bio * bio) mp_bh->bio.bi_opf |= REQ_FAILFAST_TRANSPORT; mp_bh->bio.bi_end_io = multipath_end_request; mp_bh->bio.bi_private = mp_bh; + mddev_check_writesame(mddev, &mp_bh->bio); generic_make_request(&mp_bh->bio); return; } diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c index d6585239bff2..93347ca7c7a6 100644 --- a/drivers/md/raid0.c +++ b/drivers/md/raid0.c @@ -503,6 +503,7 @@ static void raid0_make_request(struct mddev *mddev, struct bio *bio) trace_block_bio_remap(bdev_get_queue(split->bi_bdev), split, disk_devt(mddev->gendisk), bio_sector); + mddev_check_writesame(mddev, split); generic_make_request(split); } } while (split != bio); diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 830ff2b20346..85f309836fd7 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1108,7 +1108,7 @@ read_again: r1_bio->read_disk = rdisk; r1_bio->start_next_window = 0; - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); @@ -1341,13 +1341,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, first_clone = 1; for (i = 0; i < disks; i++) { - struct bio *mbio; + struct bio *mbio = NULL; + sector_t offset; if (!r1_bio->bios[i]) continue; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); - bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, - max_sectors); + offset = r1_bio->sector - bio->bi_iter.bi_sector; if (first_clone) { /* do behind I/O ? @@ -1357,8 +1356,13 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, if (bitmap && (atomic_read(&bitmap->behind_writes) < mddev->bitmap_info.max_write_behind) && - !waitqueue_active(&bitmap->behind_wait)) + !waitqueue_active(&bitmap->behind_wait)) { + mbio = bio_clone_bioset_partial(bio, GFP_NOIO, + mddev->bio_set, + offset, + max_sectors); alloc_behind_pages(mbio, r1_bio); + } bitmap_startwrite(bitmap, r1_bio->sector, r1_bio->sectors, @@ -1366,6 +1370,12 @@ static void raid1_write_request(struct mddev *mddev, struct bio *bio, &r1_bio->state)); first_clone = 0; } + + if (!mbio) { + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); + bio_trim(mbio, offset, max_sectors); + } + if (r1_bio->behind_bvecs) { struct bio_vec *bvec; int j; @@ -2273,7 +2283,8 @@ static int narrow_write_error(struct r1bio *r1_bio, int i) wbio->bi_vcnt = vcnt; } else { - wbio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); } bio_set_op_attrs(wbio, REQ_OP_WRITE, 0); @@ -2411,7 +2422,8 @@ read_more: const unsigned long do_sync = r1_bio->master_bio->bi_opf & REQ_SYNC; r1_bio->read_disk = disk; - bio = bio_clone_mddev(r1_bio->master_bio, GFP_NOIO, mddev); + bio = bio_clone_fast(r1_bio->master_bio, GFP_NOIO, + mddev->bio_set); bio_trim(bio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors); r1_bio->bios[r1_bio->read_disk] = bio; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 6bc5c2a85160..063c43d83b72 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1132,7 +1132,7 @@ read_again: } slot = r10_bio->read_slot; - read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev); + read_bio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); @@ -1406,7 +1406,7 @@ retry_write: int d = r10_bio->devs[i].devnum; if (r10_bio->devs[i].bio) { struct md_rdev *rdev = conf->mirrors[d].rdev; - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].bio = mbio; @@ -1457,7 +1457,7 @@ retry_write: smp_mb(); rdev = conf->mirrors[d].rdev; } - mbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + mbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(mbio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[i].repl_bio = mbio; @@ -2565,7 +2565,7 @@ static int narrow_write_error(struct r10bio *r10_bio, int i) if (sectors > sect_to_write) sectors = sect_to_write; /* Write at 'sector' for 'sectors' */ - wbio = bio_clone_mddev(bio, GFP_NOIO, mddev); + wbio = bio_clone_fast(bio, GFP_NOIO, mddev->bio_set); bio_trim(wbio, sector - bio->bi_iter.bi_sector, sectors); wsector = r10_bio->devs[i].addr + (sector - r10_bio->sector); wbio->bi_iter.bi_sector = wsector + @@ -2641,8 +2641,7 @@ read_more: mdname(mddev), bdevname(rdev->bdev, b), (unsigned long long)r10_bio->sector); - bio = bio_clone_mddev(r10_bio->master_bio, - GFP_NOIO, mddev); + bio = bio_clone_fast(r10_bio->master_bio, GFP_NOIO, mddev->bio_set); bio_trim(bio, r10_bio->sector - bio->bi_iter.bi_sector, max_sectors); r10_bio->devs[slot].bio = bio; r10_bio->devs[slot].rdev = rdev; diff --git a/drivers/md/raid5-cache.c b/drivers/md/raid5-cache.c index 302dea3296ba..3f307be01b10 100644 --- a/drivers/md/raid5-cache.c +++ b/drivers/md/raid5-cache.c @@ -20,6 +20,7 @@ #include <linux/crc32c.h> #include <linux/random.h> #include <linux/kthread.h> +#include <linux/types.h> #include "md.h" #include "raid5.h" #include "bitmap.h" @@ -164,9 +165,60 @@ struct r5l_log { struct work_struct deferred_io_work; /* to disable write back during in degraded mode */ struct work_struct disable_writeback_work; + + /* to for chunk_aligned_read in writeback mode, details below */ + spinlock_t tree_lock; + struct radix_tree_root big_stripe_tree; }; /* + * Enable chunk_aligned_read() with write back cache. + * + * Each chunk may contain more than one stripe (for example, a 256kB + * chunk contains 64 4kB-page, so this chunk contain 64 stripes). For + * chunk_aligned_read, these stripes are grouped into one "big_stripe". + * For each big_stripe, we count how many stripes of this big_stripe + * are in the write back cache. These data are tracked in a radix tree + * (big_stripe_tree). We use radix_tree item pointer as the counter. + * r5c_tree_index() is used to calculate keys for the radix tree. + * + * chunk_aligned_read() calls r5c_big_stripe_cached() to look up + * big_stripe of each chunk in the tree. If this big_stripe is in the + * tree, chunk_aligned_read() aborts. This look up is protected by + * rcu_read_lock(). + * + * It is necessary to remember whether a stripe is counted in + * big_stripe_tree. Instead of adding new flag, we reuses existing flags: + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE. If either of these + * two flags are set, the stripe is counted in big_stripe_tree. This + * requires moving set_bit(STRIPE_R5C_PARTIAL_STRIPE) to + * r5c_try_caching_write(); and moving clear_bit of + * STRIPE_R5C_PARTIAL_STRIPE and STRIPE_R5C_FULL_STRIPE to + * r5c_finish_stripe_write_out(). + */ + +/* + * radix tree requests lowest 2 bits of data pointer to be 2b'00. + * So it is necessary to left shift the counter by 2 bits before using it + * as data pointer of the tree. + */ +#define R5C_RADIX_COUNT_SHIFT 2 + +/* + * calculate key for big_stripe_tree + * + * sect: align_bi->bi_iter.bi_sector or sh->sector + */ +static inline sector_t r5c_tree_index(struct r5conf *conf, + sector_t sect) +{ + sector_t offset; + + offset = sector_div(sect, conf->chunk_sectors); + return sect; +} + +/* * an IO range starts from a meta data block and end at the next meta data * block. The io unit's the meta data block tracks data/parity followed it. io * unit is written to log disk with normal write, as we always flush log disk @@ -337,17 +389,30 @@ void r5c_check_cached_full_stripe(struct r5conf *conf) /* * Total log space (in sectors) needed to flush all data in cache * - * Currently, writing-out phase automatically includes all pending writes - * to the same sector. So the reclaim of each stripe takes up to - * (conf->raid_disks + 1) pages of log space. + * To avoid deadlock due to log space, it is necessary to reserve log + * space to flush critical stripes (stripes that occupying log space near + * last_checkpoint). This function helps check how much log space is + * required to flush all cached stripes. * - * To totally avoid deadlock due to log space, the code reserves - * (conf->raid_disks + 1) pages for each stripe in cache, which is not - * necessary in most cases. + * To reduce log space requirements, two mechanisms are used to give cache + * flush higher priorities: + * 1. In handle_stripe_dirtying() and schedule_reconstruction(), + * stripes ALREADY in journal can be flushed w/o pending writes; + * 2. In r5l_write_stripe() and r5c_cache_data(), stripes NOT in journal + * can be delayed (r5l_add_no_space_stripe). * - * To improve this, we will need writing-out phase to be able to NOT include - * pending writes, which will reduce the requirement to - * (conf->max_degraded + 1) pages per stripe in cache. + * In cache flush, the stripe goes through 1 and then 2. For a stripe that + * already passed 1, flushing it requires at most (conf->max_degraded + 1) + * pages of journal space. For stripes that has not passed 1, flushing it + * requires (conf->raid_disks + 1) pages of journal space. There are at + * most (conf->group_cnt + 1) stripe that passed 1. So total journal space + * required to flush all cached stripes (in pages) is: + * + * (stripe_in_journal_count - group_cnt - 1) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks + 1) + * or + * (stripe_in_journal_count) * (max_degraded + 1) + + * (group_cnt + 1) * (raid_disks - max_degraded) */ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) { @@ -356,8 +421,9 @@ static sector_t r5c_log_required_to_flush_cache(struct r5conf *conf) if (!r5c_is_writeback(log)) return 0; - return BLOCK_SECTORS * (conf->raid_disks + 1) * - atomic_read(&log->stripe_in_journal_count); + return BLOCK_SECTORS * + ((conf->max_degraded + 1) * atomic_read(&log->stripe_in_journal_count) + + (conf->raid_disks - conf->max_degraded) * (conf->group_cnt + 1)); } /* @@ -412,16 +478,6 @@ void r5c_make_stripe_write_out(struct stripe_head *sh) if (!test_and_set_bit(STRIPE_PREREAD_ACTIVE, &sh->state)) atomic_inc(&conf->preread_active_stripes); - - if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { - BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); - atomic_dec(&conf->r5c_cached_partial_stripes); - } - - if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { - BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); - atomic_dec(&conf->r5c_cached_full_stripes); - } } static void r5c_handle_data_cached(struct stripe_head *sh) @@ -1271,6 +1327,10 @@ static void r5c_flush_stripe(struct r5conf *conf, struct stripe_head *sh) atomic_inc(&conf->active_stripes); r5c_make_stripe_write_out(sh); + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) + atomic_inc(&conf->r5c_flushing_partial_stripes); + else + atomic_inc(&conf->r5c_flushing_full_stripes); raid5_release_stripe(sh); } @@ -1313,12 +1373,16 @@ static void r5c_do_reclaim(struct r5conf *conf) unsigned long flags; int total_cached; int stripes_to_flush; + int flushing_partial, flushing_full; if (!r5c_is_writeback(log)) return; + flushing_partial = atomic_read(&conf->r5c_flushing_partial_stripes); + flushing_full = atomic_read(&conf->r5c_flushing_full_stripes); total_cached = atomic_read(&conf->r5c_cached_partial_stripes) + - atomic_read(&conf->r5c_cached_full_stripes); + atomic_read(&conf->r5c_cached_full_stripes) - + flushing_full - flushing_partial; if (total_cached > conf->min_nr_stripes * 3 / 4 || atomic_read(&conf->empty_inactive_list_nr) > 0) @@ -1328,7 +1392,7 @@ static void r5c_do_reclaim(struct r5conf *conf) */ stripes_to_flush = R5C_RECLAIM_STRIPE_GROUP; else if (total_cached > conf->min_nr_stripes * 1 / 2 || - atomic_read(&conf->r5c_cached_full_stripes) > + atomic_read(&conf->r5c_cached_full_stripes) - flushing_full > R5C_FULL_STRIPE_FLUSH_BATCH) /* * if stripe cache pressure moderate, or if there is many full @@ -1362,9 +1426,9 @@ static void r5c_do_reclaim(struct r5conf *conf) !test_bit(STRIPE_HANDLE, &sh->state) && atomic_read(&sh->count) == 0) { r5c_flush_stripe(conf, sh); + if (count++ >= R5C_RECLAIM_STRIPE_GROUP) + break; } - if (count++ >= R5C_RECLAIM_STRIPE_GROUP) - break; } spin_unlock(&conf->device_lock); spin_unlock_irqrestore(&log->stripe_in_journal_lock, flags); @@ -2320,6 +2384,10 @@ int r5c_try_caching_write(struct r5conf *conf, int i; struct r5dev *dev; int to_cache = 0; + void **pslot; + sector_t tree_index; + int ret; + uintptr_t refcount; BUG_ON(!r5c_is_writeback(log)); @@ -2364,6 +2432,44 @@ int r5c_try_caching_write(struct r5conf *conf, } } + /* if the stripe is not counted in big_stripe_tree, add it now */ + if (!test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) && + !test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + if (pslot) { + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount + 1) << R5C_RADIX_COUNT_SHIFT)); + } else { + /* + * this radix_tree_insert can fail safely, so no + * need to call radix_tree_preload() + */ + ret = radix_tree_insert( + &log->big_stripe_tree, tree_index, + (void *)(1 << R5C_RADIX_COUNT_SHIFT)); + if (ret) { + spin_unlock(&log->tree_lock); + r5c_make_stripe_write_out(sh); + return -EAGAIN; + } + } + spin_unlock(&log->tree_lock); + + /* + * set STRIPE_R5C_PARTIAL_STRIPE, this shows the stripe is + * counted in the radix tree + */ + set_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state); + atomic_inc(&conf->r5c_cached_partial_stripes); + } + for (i = disks; i--; ) { dev = &sh->dev[i]; if (dev->towrite) { @@ -2438,17 +2544,20 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, struct stripe_head *sh, struct stripe_head_state *s) { + struct r5l_log *log = conf->log; int i; int do_wakeup = 0; + sector_t tree_index; + void **pslot; + uintptr_t refcount; - if (!conf->log || - !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) + if (!log || !test_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags)) return; WARN_ON(test_bit(STRIPE_R5C_CACHING, &sh->state)); clear_bit(R5_InJournal, &sh->dev[sh->pd_idx].flags); - if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) + if (log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH) return; for (i = sh->disks; i--; ) { @@ -2470,12 +2579,45 @@ void r5c_finish_stripe_write_out(struct r5conf *conf, if (do_wakeup) wake_up(&conf->wait_for_overlap); - spin_lock_irq(&conf->log->stripe_in_journal_lock); + spin_lock_irq(&log->stripe_in_journal_lock); list_del_init(&sh->r5c); - spin_unlock_irq(&conf->log->stripe_in_journal_lock); + spin_unlock_irq(&log->stripe_in_journal_lock); sh->log_start = MaxSector; - atomic_dec(&conf->log->stripe_in_journal_count); - r5c_update_log_state(conf->log); + + atomic_dec(&log->stripe_in_journal_count); + r5c_update_log_state(log); + + /* stop counting this stripe in big_stripe_tree */ + if (test_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state) || + test_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + tree_index = r5c_tree_index(conf, sh->sector); + spin_lock(&log->tree_lock); + pslot = radix_tree_lookup_slot(&log->big_stripe_tree, + tree_index); + BUG_ON(pslot == NULL); + refcount = (uintptr_t)radix_tree_deref_slot_protected( + pslot, &log->tree_lock) >> + R5C_RADIX_COUNT_SHIFT; + if (refcount == 1) + radix_tree_delete(&log->big_stripe_tree, tree_index); + else + radix_tree_replace_slot( + &log->big_stripe_tree, pslot, + (void *)((refcount - 1) << R5C_RADIX_COUNT_SHIFT)); + spin_unlock(&log->tree_lock); + } + + if (test_and_clear_bit(STRIPE_R5C_PARTIAL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_partial_stripes) == 0); + atomic_dec(&conf->r5c_flushing_partial_stripes); + atomic_dec(&conf->r5c_cached_partial_stripes); + } + + if (test_and_clear_bit(STRIPE_R5C_FULL_STRIPE, &sh->state)) { + BUG_ON(atomic_read(&conf->r5c_cached_full_stripes) == 0); + atomic_dec(&conf->r5c_flushing_full_stripes); + atomic_dec(&conf->r5c_cached_full_stripes); + } } int @@ -2535,6 +2677,22 @@ r5c_cache_data(struct r5l_log *log, struct stripe_head *sh, return 0; } +/* check whether this big stripe is in write back cache. */ +bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect) +{ + struct r5l_log *log = conf->log; + sector_t tree_index; + void *slot; + + if (!log) + return false; + + WARN_ON_ONCE(!rcu_read_lock_held()); + tree_index = r5c_tree_index(conf, sect); + slot = radix_tree_lookup(&log->big_stripe_tree, tree_index); + return slot != NULL; +} + static int r5l_load_log(struct r5l_log *log) { struct md_rdev *rdev = log->rdev; @@ -2681,6 +2839,9 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev) if (!log->meta_pool) goto out_mempool; + spin_lock_init(&log->tree_lock); + INIT_RADIX_TREE(&log->big_stripe_tree, GFP_NOWAIT | __GFP_NOWARN); + log->reclaim_thread = md_register_thread(r5l_reclaim_thread, log->rdev->mddev, "reclaim"); if (!log->reclaim_thread) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 6214e699342c..2ce23b01dbb2 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -281,13 +281,13 @@ static void do_release_stripe(struct r5conf *conf, struct stripe_head *sh, atomic_dec(&conf->r5c_cached_partial_stripes); list_add_tail(&sh->lru, &conf->r5c_full_stripe_list); r5c_check_cached_full_stripe(conf); - } else { - /* partial stripe */ - if (!test_and_set_bit(STRIPE_R5C_PARTIAL_STRIPE, - &sh->state)) - atomic_inc(&conf->r5c_cached_partial_stripes); + } else + /* + * STRIPE_R5C_PARTIAL_STRIPE is set in + * r5c_try_caching_write(). No need to + * set it again. + */ list_add_tail(&sh->lru, &conf->r5c_partial_stripe_list); - } } } } @@ -353,17 +353,15 @@ static void release_inactive_stripe_list(struct r5conf *conf, static int release_stripe_list(struct r5conf *conf, struct list_head *temp_inactive_list) { - struct stripe_head *sh; + struct stripe_head *sh, *t; int count = 0; struct llist_node *head; head = llist_del_all(&conf->released_stripes); head = llist_reverse_order(head); - while (head) { + llist_for_each_entry_safe(sh, t, head, release_list) { int hash; - sh = llist_entry(head, struct stripe_head, release_list); - head = llist_next(head); /* sh could be readded after STRIPE_ON_RELEASE_LIST is cleard */ smp_mb(); clear_bit(STRIPE_ON_RELEASE_LIST, &sh->state); @@ -863,6 +861,43 @@ static int use_new_offset(struct r5conf *conf, struct stripe_head *sh) return 1; } +static void flush_deferred_bios(struct r5conf *conf) +{ + struct bio_list tmp; + struct bio *bio; + + if (!conf->batch_bio_dispatch || !conf->group_cnt) + return; + + bio_list_init(&tmp); + spin_lock(&conf->pending_bios_lock); + bio_list_merge(&tmp, &conf->pending_bios); + bio_list_init(&conf->pending_bios); + spin_unlock(&conf->pending_bios_lock); + + while ((bio = bio_list_pop(&tmp))) + generic_make_request(bio); +} + +static void defer_bio_issue(struct r5conf *conf, struct bio *bio) +{ + /* + * change group_cnt will drain all bios, so this is safe + * + * A read generally means a read-modify-write, which usually means a + * randwrite, so we don't delay it + */ + if (!conf->batch_bio_dispatch || !conf->group_cnt || + bio_op(bio) == REQ_OP_READ) { + generic_make_request(bio); + return; + } + spin_lock(&conf->pending_bios_lock); + bio_list_add(&conf->pending_bios, bio); + spin_unlock(&conf->pending_bios_lock); + md_wakeup_thread(conf->mddev->thread); +} + static void raid5_end_read_request(struct bio *bi); static void @@ -1043,7 +1078,7 @@ again: trace_block_bio_remap(bdev_get_queue(bi->bi_bdev), bi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(bi); + defer_bio_issue(conf, bi); } if (rrdev) { if (s->syncing || s->expanding || s->expanded @@ -1088,7 +1123,7 @@ again: trace_block_bio_remap(bdev_get_queue(rbi->bi_bdev), rbi, disk_devt(conf->mddev->gendisk), sh->dev[i].sector); - generic_make_request(rbi); + defer_bio_issue(conf, rbi); } if (!rdev && !rrdev) { if (op_is_write(op)) @@ -2914,12 +2949,36 @@ sector_t raid5_compute_blocknr(struct stripe_head *sh, int i, int previous) * like to flush data in journal to RAID disks first, so complex rmw * is handled in the write patch (handle_stripe_dirtying). * + * 2. when journal space is critical (R5C_LOG_CRITICAL=1) + * + * It is important to be able to flush all stripes in raid5-cache. + * Therefore, we need reserve some space on the journal device for + * these flushes. If flush operation includes pending writes to the + * stripe, we need to reserve (conf->raid_disk + 1) pages per stripe + * for the flush out. If we exclude these pending writes from flush + * operation, we only need (conf->max_degraded + 1) pages per stripe. + * Therefore, excluding pending writes in these cases enables more + * efficient use of the journal device. + * + * Note: To make sure the stripe makes progress, we only delay + * towrite for stripes with data already in journal (injournal > 0). + * When LOG_CRITICAL, stripes with injournal == 0 will be sent to + * no_space_stripes list. + * */ -static inline bool delay_towrite(struct r5dev *dev, - struct stripe_head_state *s) +static inline bool delay_towrite(struct r5conf *conf, + struct r5dev *dev, + struct stripe_head_state *s) { - return !test_bit(R5_OVERWRITE, &dev->flags) && - !test_bit(R5_Insync, &dev->flags) && s->injournal; + /* case 1 above */ + if (!test_bit(R5_OVERWRITE, &dev->flags) && + !test_bit(R5_Insync, &dev->flags) && s->injournal) + return true; + /* case 2 above */ + if (test_bit(R5C_LOG_CRITICAL, &conf->cache_state) && + s->injournal > 0) + return true; + return false; } static void @@ -2942,7 +3001,7 @@ schedule_reconstruction(struct stripe_head *sh, struct stripe_head_state *s, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (dev->towrite && !delay_towrite(dev, s)) { + if (dev->towrite && !delay_towrite(conf, dev, s)) { set_bit(R5_LOCKED, &dev->flags); set_bit(R5_Wantdrain, &dev->flags); if (!expand) @@ -3694,7 +3753,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, } else for (i = disks; i--; ) { /* would I have to read this buffer for read_modify_write */ struct r5dev *dev = &sh->dev[i]; - if (((dev->towrite && !delay_towrite(dev, s)) || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && @@ -3718,8 +3777,8 @@ static int handle_stripe_dirtying(struct r5conf *conf, } } - pr_debug("for sector %llu, rmw=%d rcw=%d\n", - (unsigned long long)sh->sector, rmw, rcw); + pr_debug("for sector %llu state 0x%lx, rmw=%d rcw=%d\n", + (unsigned long long)sh->sector, sh->state, rmw, rcw); set_bit(STRIPE_HANDLE, &sh->state); if ((rmw < rcw || (rmw == rcw && conf->rmw_level == PARITY_PREFER_RMW)) && rmw > 0) { /* prefer read-modify-write, but need to get some data */ @@ -3759,7 +3818,7 @@ static int handle_stripe_dirtying(struct r5conf *conf, for (i = disks; i--; ) { struct r5dev *dev = &sh->dev[i]; - if (((dev->towrite && !delay_towrite(dev, s)) || + if (((dev->towrite && !delay_towrite(conf, dev, s)) || i == sh->pd_idx || i == sh->qd_idx || test_bit(R5_InJournal, &dev->flags)) && !test_bit(R5_LOCKED, &dev->flags) && @@ -4995,9 +5054,9 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) return 0; } /* - * use bio_clone_mddev to make a copy of the bio + * use bio_clone_fast to make a copy of the bio */ - align_bi = bio_clone_mddev(raid_bio, GFP_NOIO, mddev); + align_bi = bio_clone_fast(raid_bio, GFP_NOIO, mddev->bio_set); if (!align_bi) return 0; /* @@ -5025,6 +5084,13 @@ static int raid5_read_one_chunk(struct mddev *mddev, struct bio *raid_bio) rdev->recovery_offset >= end_sector))) rdev = NULL; } + + if (r5c_big_stripe_cached(conf, align_bi->bi_iter.bi_sector)) { + rcu_read_unlock(); + bio_put(align_bi); + return 0; + } + if (rdev) { sector_t first_bad; int bad_sectors; @@ -5381,7 +5447,6 @@ static void raid5_make_request(struct mddev *mddev, struct bio * bi) * data on failed drives. */ if (rw == READ && mddev->degraded == 0 && - !r5c_is_writeback(conf->log) && mddev->reshape_position == MaxSector) { bi = chunk_aligned_read(mddev, bi); if (!bi) @@ -6126,6 +6191,8 @@ static void raid5d(struct md_thread *thread) mutex_unlock(&conf->cache_size_mutex); } + flush_deferred_bios(conf); + r5l_flush_stripe_to_raid(conf->log); async_tx_issue_pending_all(); @@ -6711,6 +6778,18 @@ static struct r5conf *setup_conf(struct mddev *mddev) atomic_set(&conf->active_stripes, 0); atomic_set(&conf->preread_active_stripes, 0); atomic_set(&conf->active_aligned_reads, 0); + bio_list_init(&conf->pending_bios); + spin_lock_init(&conf->pending_bios_lock); + conf->batch_bio_dispatch = true; + rdev_for_each(rdev, mddev) { + if (test_bit(Journal, &rdev->flags)) + continue; + if (blk_queue_nonrot(bdev_get_queue(rdev->bdev))) { + conf->batch_bio_dispatch = false; + break; + } + } + conf->bypass_threshold = BYPASS_THRESHOLD; conf->recovery_disabled = mddev->recovery_disabled - 1; @@ -6757,6 +6836,8 @@ static struct r5conf *setup_conf(struct mddev *mddev) INIT_LIST_HEAD(&conf->r5c_full_stripe_list); atomic_set(&conf->r5c_cached_partial_stripes, 0); INIT_LIST_HEAD(&conf->r5c_partial_stripe_list); + atomic_set(&conf->r5c_flushing_full_stripes, 0); + atomic_set(&conf->r5c_flushing_partial_stripes, 0); conf->level = mddev->new_level; conf->chunk_sectors = mddev->new_chunk_sectors; diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 1440fa26e296..4bb27b97bf6b 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -663,6 +663,8 @@ struct r5conf { struct list_head r5c_full_stripe_list; atomic_t r5c_cached_partial_stripes; struct list_head r5c_partial_stripe_list; + atomic_t r5c_flushing_full_stripes; + atomic_t r5c_flushing_partial_stripes; atomic_t empty_inactive_list_nr; struct llist_head released_stripes; @@ -684,6 +686,10 @@ struct r5conf { int group_cnt; int worker_cnt_per_group; struct r5l_log *log; + + struct bio_list pending_bios; + spinlock_t pending_bios_lock; + bool batch_bio_dispatch; }; @@ -788,4 +794,5 @@ extern void r5c_check_stripe_cache_usage(struct r5conf *conf); extern void r5c_check_cached_full_stripe(struct r5conf *conf); extern struct md_sysfs_entry r5c_journal_mode; extern void r5c_update_on_rdev_error(struct mddev *mddev); +extern bool r5c_big_stripe_cached(struct r5conf *conf, sector_t sect); #endif diff --git a/include/linux/bio.h b/include/linux/bio.h index 7cf8a6c70a3f..8e521194f6fc 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -183,7 +183,7 @@ static inline void bio_advance_iter(struct bio *bio, struct bvec_iter *iter, #define bio_iter_last(bvec, iter) ((iter).bi_size == (bvec).bv_len) -static inline unsigned bio_segments(struct bio *bio) +static inline unsigned __bio_segments(struct bio *bio, struct bvec_iter *bvec) { unsigned segs = 0; struct bio_vec bv; @@ -205,12 +205,17 @@ static inline unsigned bio_segments(struct bio *bio) break; } - bio_for_each_segment(bv, bio, iter) + __bio_for_each_segment(bv, bio, iter, *bvec) segs++; return segs; } +static inline unsigned bio_segments(struct bio *bio) +{ + return __bio_segments(bio, &bio->bi_iter); +} + /* * get a reference to a bio, so it won't disappear. the intended use is * something like: @@ -384,6 +389,8 @@ extern void bio_put(struct bio *); extern void __bio_clone_fast(struct bio *, struct bio *); extern struct bio *bio_clone_fast(struct bio *, gfp_t, struct bio_set *); extern struct bio *bio_clone_bioset(struct bio *, gfp_t, struct bio_set *bs); +extern struct bio *bio_clone_bioset_partial(struct bio *, gfp_t, + struct bio_set *, int, int); extern struct bio_set *fs_bio_set; diff --git a/lib/radix-tree.c b/lib/radix-tree.c index 84812a9fb16f..72fab4999c00 100644 --- a/lib/radix-tree.c +++ b/lib/radix-tree.c @@ -1102,6 +1102,7 @@ void radix_tree_replace_slot(struct radix_tree_root *root, { replace_slot(root, NULL, slot, item, true); } +EXPORT_SYMBOL(radix_tree_replace_slot); /** * radix_tree_iter_replace - replace item in a slot |