From d1901ef099c38afd11add4cfb3312c02ef21ec4a Mon Sep 17 00:00:00 2001 From: Tomáš Hodek Date: Mon, 23 Feb 2015 11:00:38 +1100 Subject: md/raid1: fix read balance when a drive is write-mostly. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a drive is marked write-mostly it should only be the target of reads if there is no other option. This behaviour was broken by commit 9dedf60313fa4dddfd5b9b226a0ef12a512bf9dc md/raid1: read balance chooses idlest disk for SSD which causes a write-mostly device to be *preferred* is some cases. Restore correct behaviour by checking and setting best_dist_disk and best_pending_disk rather than best_disk. We only need to test one of these as they are both changed from -1 or >=0 at the same time. As we leave min_pending and best_dist unchanged, any non-write-mostly device will appear better than the write-mostly device. Reported-by: Tomáš Hodek Reported-by: Dark Penguin Signed-off-by: NeilBrown Link: http://marc.info/?l=linux-raid&m=135982797322422 Fixes: 9dedf60313fa4dddfd5b9b226a0ef12a512bf9dc Cc: stable@vger.kernel.org (3.6+) --- drivers/md/raid1.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 4153da5d4011..d34e238afa54 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -560,7 +560,7 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect if (test_bit(WriteMostly, &rdev->flags)) { /* Don't balance among write-mostly, just * use the first as a last resort */ - if (best_disk < 0) { + if (best_dist_disk < 0) { if (is_badblock(rdev, this_sector, sectors, &first_bad, &bad_sectors)) { if (first_bad < this_sector) @@ -569,7 +569,8 @@ static int read_balance(struct r1conf *conf, struct r1bio *r1_bio, int *max_sect best_good_sectors = first_bad - this_sector; } else best_good_sectors = sectors; - best_disk = disk; + best_dist_disk = disk; + best_pending_disk = disk; } continue; } -- cgit v1.2.3 From 16d9cfab930bb6f4946cff8ba7429701fd15b414 Mon Sep 17 00:00:00 2001 From: Eric Mei Date: Tue, 6 Jan 2015 09:35:02 -0800 Subject: raid5: check faulty flag for array status during recovery. When we have more than 1 drive failure, it's possible we start rebuild one drive while leaving another faulty drive in array. To determine whether array will be optimal after building, current code only check whether a drive is missing, which could potentially lead to data corruption. This patch is to add checking Faulty flag. Signed-off-by: NeilBrown --- drivers/md/raid5.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index e75d48c0421a..cd2f96b2c572 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5121,12 +5121,17 @@ static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int schedule_timeout_uninterruptible(1); } /* Need to check if array will still be degraded after recovery/resync - * We don't need to check the 'failed' flag as when that gets set, - * recovery aborts. + * Note in case of > 1 drive failures it's possible we're rebuilding + * one drive while leaving another faulty drive in array. */ - for (i = 0; i < conf->raid_disks; i++) - if (conf->disks[i].rdev == NULL) + rcu_read_lock(); + for (i = 0; i < conf->raid_disks; i++) { + struct md_rdev *rdev = ACCESS_ONCE(conf->disks[i].rdev); + + if (rdev == NULL || test_bit(Faulty, &rdev->flags)) still_degraded = 1; + } + rcu_read_unlock(); bitmap_start_sync(mddev->bitmap, sector_nr, &sync_blocks, still_degraded); -- cgit v1.2.3 From 750f199ee8b578062341e6ddfe36c59ac8ff2dcb Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Tue, 30 Sep 2014 08:53:05 +1000 Subject: md: mark some attributes as pre-alloc Since __ATTR_PREALLOC was introduced in v3.19-rc1~78^2~18 it can now be used by md. This ensure that writing to these sysfs attributes will never block due to a memory allocation. Such blocking could become a deadlock if mdmon is trying to reconfigure an array after a failure prior to re-enabling writes. Signed-off-by: NeilBrown --- drivers/md/md.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/md/md.c b/drivers/md/md.c index c8d2bac4e28b..cadf9cc02b25 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2555,7 +2555,7 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len) return err ? err : len; } static struct rdev_sysfs_entry rdev_state = -__ATTR(state, S_IRUGO|S_IWUSR, state_show, state_store); +__ATTR_PREALLOC(state, S_IRUGO|S_IWUSR, state_show, state_store); static ssize_t errors_show(struct md_rdev *rdev, char *page) @@ -3638,7 +3638,8 @@ resync_start_store(struct mddev *mddev, const char *buf, size_t len) return err ?: len; } static struct md_sysfs_entry md_resync_start = -__ATTR(resync_start, S_IRUGO|S_IWUSR, resync_start_show, resync_start_store); +__ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR, + resync_start_show, resync_start_store); /* * The array state can be: @@ -3851,7 +3852,7 @@ array_state_store(struct mddev *mddev, const char *buf, size_t len) return err ?: len; } static struct md_sysfs_entry md_array_state = -__ATTR(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); +__ATTR_PREALLOC(array_state, S_IRUGO|S_IWUSR, array_state_show, array_state_store); static ssize_t max_corrected_read_errors_show(struct mddev *mddev, char *page) { @@ -4101,7 +4102,7 @@ out_unlock: } static struct md_sysfs_entry md_metadata = -__ATTR(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); +__ATTR_PREALLOC(metadata_version, S_IRUGO|S_IWUSR, metadata_show, metadata_store); static ssize_t action_show(struct mddev *mddev, char *page) @@ -4189,7 +4190,7 @@ action_store(struct mddev *mddev, const char *page, size_t len) } static struct md_sysfs_entry md_scan_mode = -__ATTR(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); +__ATTR_PREALLOC(sync_action, S_IRUGO|S_IWUSR, action_show, action_store); static ssize_t last_sync_action_show(struct mddev *mddev, char *page) @@ -4335,7 +4336,8 @@ sync_completed_show(struct mddev *mddev, char *page) return sprintf(page, "%llu / %llu\n", resync, max_sectors); } -static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed); +static struct md_sysfs_entry md_sync_completed = + __ATTR_PREALLOC(sync_completed, S_IRUGO, sync_completed_show, NULL); static ssize_t min_sync_show(struct mddev *mddev, char *page) -- cgit v1.2.3