summaryrefslogtreecommitdiff
path: root/drivers/md/md.c
diff options
context:
space:
mode:
Diffstat (limited to 'drivers/md/md.c')
-rw-r--r--drivers/md/md.c536
1 files changed, 299 insertions, 237 deletions
diff --git a/drivers/md/md.c b/drivers/md/md.c
index f20d13e717d..b76cfc89e1b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -36,7 +36,7 @@
#include <linux/blkdev.h>
#include <linux/sysctl.h>
#include <linux/seq_file.h>
-#include <linux/smp_lock.h>
+#include <linux/mutex.h>
#include <linux/buffer_head.h> /* for invalidate_bdev */
#include <linux/poll.h>
#include <linux/ctype.h>
@@ -57,7 +57,6 @@
#define DEBUG 0
#define dprintk(x...) ((void)(DEBUG && printk(x)))
-
#ifndef MODULE
static void autostart_arrays(int part);
#endif
@@ -68,6 +67,8 @@ static DEFINE_SPINLOCK(pers_lock);
static void md_print_devices(void);
static DECLARE_WAIT_QUEUE_HEAD(resync_wait);
+static struct workqueue_struct *md_wq;
+static struct workqueue_struct *md_misc_wq;
#define MD_BUG(x...) { printk("md: bug in file %s, line %d\n", __FILE__, __LINE__); md_print_devices(); }
@@ -148,6 +149,72 @@ static const struct block_device_operations md_fops;
static int start_readonly;
+/* bio_clone_mddev
+ * like bio_clone, but with a local bio set
+ */
+
+static void mddev_bio_destructor(struct bio *bio)
+{
+ mddev_t *mddev, **mddevp;
+
+ mddevp = (void*)bio;
+ mddev = mddevp[-1];
+
+ bio_free(bio, mddev->bio_set);
+}
+
+struct bio *bio_alloc_mddev(gfp_t gfp_mask, int nr_iovecs,
+ mddev_t *mddev)
+{
+ struct bio *b;
+ mddev_t **mddevp;
+
+ if (!mddev || !mddev->bio_set)
+ return bio_alloc(gfp_mask, nr_iovecs);
+
+ b = bio_alloc_bioset(gfp_mask, nr_iovecs,
+ mddev->bio_set);
+ if (!b)
+ return NULL;
+ mddevp = (void*)b;
+ mddevp[-1] = mddev;
+ b->bi_destructor = mddev_bio_destructor;
+ return b;
+}
+EXPORT_SYMBOL_GPL(bio_alloc_mddev);
+
+struct bio *bio_clone_mddev(struct bio *bio, gfp_t gfp_mask,
+ mddev_t *mddev)
+{
+ struct bio *b;
+ mddev_t **mddevp;
+
+ if (!mddev || !mddev->bio_set)
+ return bio_clone(bio, gfp_mask);
+
+ b = bio_alloc_bioset(gfp_mask, bio->bi_max_vecs,
+ mddev->bio_set);
+ if (!b)
+ return NULL;
+ mddevp = (void*)b;
+ mddevp[-1] = mddev;
+ b->bi_destructor = mddev_bio_destructor;
+ __bio_clone(b, bio);
+ if (bio_integrity(bio)) {
+ int ret;
+
+ ret = bio_integrity_clone(b, bio, gfp_mask, mddev->bio_set);
+
+ if (ret < 0) {
+ bio_put(b);
+ return NULL;
+ }
+ }
+
+ return b;
+}
+EXPORT_SYMBOL_GPL(bio_clone_mddev);
+
/*
* We have a system wide 'event count' that is incremented
* on any 'interesting' event, and readers of /proc/mdstat
@@ -221,17 +288,19 @@ static int md_make_request(struct request_queue *q, struct bio *bio)
int rv;
int cpu;
- if (mddev == NULL || mddev->pers == NULL) {
+ if (mddev == NULL || mddev->pers == NULL
+ || !mddev->ready) {
bio_io_error(bio);
return 0;
}
+ smp_rmb(); /* Ensure implications of 'active' are visible */
rcu_read_lock();
- if (mddev->suspended || mddev->barrier) {
+ if (mddev->suspended) {
DEFINE_WAIT(__wait);
for (;;) {
prepare_to_wait(&mddev->sb_wait, &__wait,
TASK_UNINTERRUPTIBLE);
- if (!mddev->suspended && !mddev->barrier)
+ if (!mddev->suspended)
break;
rcu_read_unlock();
schedule();
@@ -282,43 +351,37 @@ EXPORT_SYMBOL_GPL(mddev_resume);
int mddev_congested(mddev_t *mddev, int bits)
{
- if (mddev->barrier)
- return 1;
return mddev->suspended;
}
EXPORT_SYMBOL(mddev_congested);
/*
- * Generic barrier handling for md
+ * Generic flush handling for md
*/
-#define POST_REQUEST_BARRIER ((void*)1)
-
-static void md_end_barrier(struct bio *bio, int err)
+static void md_end_flush(struct bio *bio, int err)
{
mdk_rdev_t *rdev = bio->bi_private;
mddev_t *mddev = rdev->mddev;
- if (err == -EOPNOTSUPP && mddev->barrier != POST_REQUEST_BARRIER)
- set_bit(BIO_EOPNOTSUPP, &mddev->barrier->bi_flags);
rdev_dec_pending(rdev, mddev);
if (atomic_dec_and_test(&mddev->flush_pending)) {
- if (mddev->barrier == POST_REQUEST_BARRIER) {
- /* This was a post-request barrier */
- mddev->barrier = NULL;
- wake_up(&mddev->sb_wait);
- } else
- /* The pre-request barrier has finished */
- schedule_work(&mddev->barrier_work);
+ /* The pre-request flush has finished */
+ queue_work(md_wq, &mddev->flush_work);
}
bio_put(bio);
}
-static void submit_barriers(mddev_t *mddev)
+static void md_submit_flush_data(struct work_struct *ws);
+
+static void submit_flushes(struct work_struct *ws)
{
+ mddev_t *mddev = container_of(ws, mddev_t, flush_work);
mdk_rdev_t *rdev;
+ INIT_WORK(&mddev->flush_work, md_submit_flush_data);
+ atomic_set(&mddev->flush_pending, 1);
rcu_read_lock();
list_for_each_entry_rcu(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0 &&
@@ -331,61 +394,51 @@ static void submit_barriers(mddev_t *mddev)
atomic_inc(&rdev->nr_pending);
atomic_inc(&rdev->nr_pending);
rcu_read_unlock();
- bi = bio_alloc(GFP_KERNEL, 0);
- bi->bi_end_io = md_end_barrier;
+ bi = bio_alloc_mddev(GFP_KERNEL, 0, mddev);
+ bi->bi_end_io = md_end_flush;
bi->bi_private = rdev;
bi->bi_bdev = rdev->bdev;
atomic_inc(&mddev->flush_pending);
- submit_bio(WRITE_BARRIER, bi);
+ submit_bio(WRITE_FLUSH, bi);
rcu_read_lock();
rdev_dec_pending(rdev, mddev);
}
rcu_read_unlock();
+ if (atomic_dec_and_test(&mddev->flush_pending))
+ queue_work(md_wq, &mddev->flush_work);
}
-static void md_submit_barrier(struct work_struct *ws)
+static void md_submit_flush_data(struct work_struct *ws)
{
- mddev_t *mddev = container_of(ws, mddev_t, barrier_work);
- struct bio *bio = mddev->barrier;
+ mddev_t *mddev = container_of(ws, mddev_t, flush_work);
+ struct bio *bio = mddev->flush_bio;
- atomic_set(&mddev->flush_pending, 1);
-
- if (test_bit(BIO_EOPNOTSUPP, &bio->bi_flags))
- bio_endio(bio, -EOPNOTSUPP);
- else if (bio->bi_size == 0)
+ if (bio->bi_size == 0)
/* an empty barrier - all done */
bio_endio(bio, 0);
else {
- bio->bi_rw &= ~REQ_HARDBARRIER;
+ bio->bi_rw &= ~REQ_FLUSH;
if (mddev->pers->make_request(mddev, bio))
generic_make_request(bio);
- mddev->barrier = POST_REQUEST_BARRIER;
- submit_barriers(mddev);
- }
- if (atomic_dec_and_test(&mddev->flush_pending)) {
- mddev->barrier = NULL;
- wake_up(&mddev->sb_wait);
}
+
+ mddev->flush_bio = NULL;
+ wake_up(&mddev->sb_wait);
}
-void md_barrier_request(mddev_t *mddev, struct bio *bio)
+void md_flush_request(mddev_t *mddev, struct bio *bio)
{
spin_lock_irq(&mddev->write_lock);
wait_event_lock_irq(mddev->sb_wait,
- !mddev->barrier,
+ !mddev->flush_bio,
mddev->write_lock, /*nothing*/);
- mddev->barrier = bio;
+ mddev->flush_bio = bio;
spin_unlock_irq(&mddev->write_lock);
- atomic_set(&mddev->flush_pending, 1);
- INIT_WORK(&mddev->barrier_work, md_submit_barrier);
-
- submit_barriers(mddev);
-
- if (atomic_dec_and_test(&mddev->flush_pending))
- schedule_work(&mddev->barrier_work);
+ INIT_WORK(&mddev->flush_work, submit_flushes);
+ queue_work(md_wq, &mddev->flush_work);
}
-EXPORT_SYMBOL(md_barrier_request);
+EXPORT_SYMBOL(md_flush_request);
/* Support for plugging.
* This mirrors the plugging support in request_queue, but does not
@@ -442,6 +495,8 @@ static void mddev_delayed_delete(struct work_struct *ws);
static void mddev_put(mddev_t *mddev)
{
+ struct bio_set *bs = NULL;
+
if (!atomic_dec_and_lock(&mddev->active, &all_mddevs_lock))
return;
if (!mddev->raid_disks && list_empty(&mddev->disks) &&
@@ -449,19 +504,22 @@ static void mddev_put(mddev_t *mddev)
/* Array is not configured at all, and not held active,
* so destroy it */
list_del(&mddev->all_mddevs);
+ bs = mddev->bio_set;
+ mddev->bio_set = NULL;
if (mddev->gendisk) {
- /* we did a probe so need to clean up.
- * Call schedule_work inside the spinlock
- * so that flush_scheduled_work() after
- * mddev_find will succeed in waiting for the
- * work to be done.
+ /* We did a probe so need to clean up. Call
+ * queue_work inside the spinlock so that
+ * flush_workqueue() after mddev_find will
+ * succeed in waiting for the work to be done.
*/
INIT_WORK(&mddev->del_work, mddev_delayed_delete);
- schedule_work(&mddev->del_work);
+ queue_work(md_misc_wq, &mddev->del_work);
} else
kfree(mddev);
}
spin_unlock(&all_mddevs_lock);
+ if (bs)
+ bioset_free(bs);
}
void mddev_init(mddev_t *mddev)
@@ -647,9 +705,9 @@ static struct mdk_personality *find_pers(int level, char *clevel)
}
/* return the offset of the super block in 512byte sectors */
-static inline sector_t calc_dev_sboffset(struct block_device *bdev)
+static inline sector_t calc_dev_sboffset(mdk_rdev_t *rdev)
{
- sector_t num_sectors = bdev->bd_inode->i_size / 512;
+ sector_t num_sectors = i_size_read(rdev->bdev->bd_inode) / 512;
return MD_NEW_SIZE_SECTORS(num_sectors);
}
@@ -696,31 +754,6 @@ static void super_written(struct bio *bio, int error)
bio_put(bio);
}
-static void super_written_barrier(struct bio *bio, int error)
-{
- struct bio *bio2 = bio->bi_private;
- mdk_rdev_t *rdev = bio2->bi_private;
- mddev_t *mddev = rdev->mddev;
-
- if (!test_bit(BIO_UPTODATE, &bio->bi_flags) &&
- error == -EOPNOTSUPP) {
- unsigned long flags;
- /* barriers don't appear to be supported :-( */
- set_bit(BarriersNotsupp, &rdev->flags);
- mddev->barriers_work = 0;
- spin_lock_irqsave(&mddev->write_lock, flags);
- bio2->bi_next = mddev->biolist;
- mddev->biolist = bio2;
- spin_unlock_irqrestore(&mddev->write_lock, flags);
- wake_up(&mddev->sb_wait);
- bio_put(bio);
- } else {
- bio_put(bio2);
- bio->bi_private = rdev;
- super_written(bio, error);
- }
-}
-
void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
sector_t sector, int size, struct page *page)
{
@@ -729,51 +762,28 @@ void md_super_write(mddev_t *mddev, mdk_rdev_t *rdev,
* and decrement it on completion, waking up sb_wait
* if zero is reached.
* If an error occurred, call md_error
- *
- * As we might need to resubmit the request if REQ_HARDBARRIER
- * causes ENOTSUPP, we allocate a spare bio...
*/
- struct bio *bio = bio_alloc(GFP_NOIO, 1);
- int rw = REQ_WRITE | REQ_SYNC | REQ_UNPLUG;
+ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, mddev);
- bio->bi_bdev = rdev->bdev;
+ bio->bi_bdev = rdev->meta_bdev ? rdev->meta_bdev : rdev->bdev;
bio->bi_sector = sector;
bio_add_page(bio, page, size, 0);
bio->bi_private = rdev;
bio->bi_end_io = super_written;
- bio->bi_rw = rw;
atomic_inc(&mddev->pending_writes);
- if (!test_bit(BarriersNotsupp, &rdev->flags)) {
- struct bio *rbio;
- rw |= REQ_HARDBARRIER;
- rbio = bio_clone(bio, GFP_NOIO);
- rbio->bi_private = bio;
- rbio->bi_end_io = super_written_barrier;
- submit_bio(rw, rbio);
- } else
- submit_bio(rw, bio);
+ submit_bio(REQ_WRITE | REQ_SYNC | REQ_UNPLUG | REQ_FLUSH | REQ_FUA,
+ bio);
}
void md_super_wait(mddev_t *mddev)
{
- /* wait for all superblock writes that were scheduled to complete.
- * if any had to be retried (due to BARRIER problems), retry them
- */
+ /* wait for all superblock writes that were scheduled to complete */
DEFINE_WAIT(wq);
for(;;) {
prepare_to_wait(&mddev->sb_wait, &wq, TASK_UNINTERRUPTIBLE);
if (atomic_read(&mddev->pending_writes)==0)
break;
- while (mddev->biolist) {
- struct bio *bio;
- spin_lock_irq(&mddev->write_lock);
- bio = mddev->biolist;
- mddev->biolist = bio->bi_next ;
- bio->bi_next = NULL;
- spin_unlock_irq(&mddev->write_lock);
- submit_bio(bio->bi_rw, bio);
- }
schedule();
}
finish_wait(&mddev->sb_wait, &wq);
@@ -784,17 +794,21 @@ static void bi_complete(struct bio *bio, int error)
complete((struct completion*)bio->bi_private);
}
-int sync_page_io(struct block_device *bdev, sector_t sector, int size,
- struct page *page, int rw)
+int sync_page_io(mdk_rdev_t *rdev, sector_t sector, int size,
+ struct page *page, int rw, bool metadata_op)
{
- struct bio *bio = bio_alloc(GFP_NOIO, 1);
+ struct bio *bio = bio_alloc_mddev(GFP_NOIO, 1, rdev->mddev);
struct completion event;
int ret;
rw |= REQ_SYNC | REQ_UNPLUG;
- bio->bi_bdev = bdev;
- bio->bi_sector = sector;
+ bio->bi_bdev = (metadata_op && rdev->meta_bdev) ?
+ rdev->meta_bdev : rdev->bdev;
+ if (metadata_op)
+ bio->bi_sector = sector + rdev->sb_start;
+ else
+ bio->bi_sector = sector + rdev->data_offset;
bio_add_page(bio, page, size, 0);
init_completion(&event);
bio->bi_private = &event;
@@ -819,7 +833,7 @@ static int read_disk_sb(mdk_rdev_t * rdev, int size)
return 0;
- if (!sync_page_io(rdev->bdev, rdev->sb_start, size, rdev->sb_page, READ))
+ if (!sync_page_io(rdev, 0, size, rdev->sb_page, READ, true))
goto fail;
rdev->sb_loaded = 1;
return 0;
@@ -981,7 +995,7 @@ static int super_90_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version
*
* It also happens to be a multiple of 4Kb.
*/
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
ret = read_disk_sb(rdev, MD_SB_BYTES);
if (ret) return ret;
@@ -1070,7 +1084,6 @@ static int super_90_validate(mddev_t *mddev, mdk_rdev_t *rdev)
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 0;
@@ -1323,13 +1336,13 @@ super_90_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
return 0; /* component must fit device */
if (rdev->mddev->bitmap_info.offset)
return 0; /* can't move bitmap */
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
if (!num_sectors || num_sectors > rdev->sb_start)
num_sectors = rdev->sb_start;
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
rdev->sb_page);
md_super_wait(rdev->mddev);
- return num_sectors / 2; /* kB for sysfs */
+ return num_sectors;
}
@@ -1378,7 +1391,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
*/
switch(minor_version) {
case 0:
- sb_start = rdev->bdev->bd_inode->i_size >> 9;
+ sb_start = i_size_read(rdev->bdev->bd_inode) >> 9;
sb_start -= 8*2;
sb_start &= ~(sector_t)(4*2-1);
break;
@@ -1464,7 +1477,7 @@ static int super_1_load(mdk_rdev_t *rdev, mdk_rdev_t *refdev, int minor_version)
ret = 0;
}
if (minor_version)
- rdev->sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+ rdev->sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
le64_to_cpu(sb->data_offset);
else
rdev->sectors = rdev->sb_start;
@@ -1485,7 +1498,6 @@ static int super_1_validate(mddev_t *mddev, mdk_rdev_t *rdev)
clear_bit(Faulty, &rdev->flags);
clear_bit(In_sync, &rdev->flags);
clear_bit(WriteMostly, &rdev->flags);
- clear_bit(BarriersNotsupp, &rdev->flags);
if (mddev->raid_disks == 0) {
mddev->major_version = 1;
@@ -1673,7 +1685,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
return 0; /* component must fit device */
if (rdev->sb_start < rdev->data_offset) {
/* minor versions 1 and 2; superblock before data */
- max_sectors = rdev->bdev->bd_inode->i_size >> 9;
+ max_sectors = i_size_read(rdev->bdev->bd_inode) >> 9;
max_sectors -= rdev->data_offset;
if (!num_sectors || num_sectors > max_sectors)
num_sectors = max_sectors;
@@ -1683,7 +1695,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
} else {
/* minor version 0; superblock after data */
sector_t sb_start;
- sb_start = (rdev->bdev->bd_inode->i_size >> 9) - 8*2;
+ sb_start = (i_size_read(rdev->bdev->bd_inode) >> 9) - 8*2;
sb_start &= ~(sector_t)(4*2 - 1);
max_sectors = rdev->sectors + sb_start - rdev->sb_start;
if (!num_sectors || num_sectors > max_sectors)
@@ -1697,7 +1709,7 @@ super_1_rdev_size_change(mdk_rdev_t *rdev, sector_t num_sectors)
md_super_write(rdev->mddev, rdev, rdev->sb_start, rdev->sb_size,
rdev->sb_page);
md_super_wait(rdev->mddev);
- return num_sectors / 2; /* kB for sysfs */
+ return num_sectors;
}
static struct super_type super_types[] = {
@@ -1873,7 +1885,7 @@ static int bind_rdev_to_array(mdk_rdev_t * rdev, mddev_t * mddev)
rdev->sysfs_state = sysfs_get_dirent_safe(rdev->kobj.sd, "state");
list_add_rcu(&rdev->same_set, &mddev->disks);
- bd_claim_by_disk(rdev->bdev, rdev->bdev->bd_holder, mddev->gendisk);
+ bd_link_disk_holder(rdev->bdev, mddev->gendisk);
/* May as well allow recovery to be retried once */
mddev->recovery_disabled = 0;
@@ -1900,7 +1912,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
MD_BUG();
return;
}
- bd_release_from_disk(rdev->bdev, rdev->mddev->gendisk);
+ bd_unlink_disk_holder(rdev->bdev, rdev->mddev->gendisk);
list_del_rcu(&rdev->same_set);
printk(KERN_INFO "md: unbind<%s>\n", bdevname(rdev->bdev,b));
rdev->mddev = NULL;
@@ -1914,7 +1926,7 @@ static void unbind_rdev_from_array(mdk_rdev_t * rdev)
synchronize_rcu();
INIT_WORK(&rdev->del_work, md_delayed_delete);
kobject_get(&rdev->kobj);
- schedule_work(&rdev->del_work);
+ queue_work(md_misc_wq, &rdev->del_work);
}
/*
@@ -1928,19 +1940,13 @@ static int lock_rdev(mdk_rdev_t *rdev, dev_t dev, int shared)
struct block_device *bdev;
char b[BDEVNAME_SIZE];
- bdev = open_by_devnum(dev, FMODE_READ|FMODE_WRITE);
+ bdev = blkdev_get_by_dev(dev, FMODE_READ|FMODE_WRITE|FMODE_EXCL,
+ shared ? (mdk_rdev_t *)lock_rdev : rdev);
if (IS_ERR(bdev)) {
printk(KERN_ERR "md: could not open %s.\n",
__bdevname(dev, b));
return PTR_ERR(bdev);
}
- err = bd_claim(bdev, shared ? (mdk_rdev_t *)lock_rdev : rdev);
- if (err) {
- printk(KERN_ERR "md: could not bd_claim %s.\n",
- bdevname(bdev, b));
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
- return err;
- }
if (!shared)
set_bit(AllReserved, &rdev->flags);
rdev->bdev = bdev;
@@ -1953,8 +1959,7 @@ static void unlock_rdev(mdk_rdev_t *rdev)
rdev->bdev = NULL;
if (!bdev)
MD_BUG();
- bd_release(bdev);
- blkdev_put(bdev, FMODE_READ|FMODE_WRITE);
+ blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
}
void md_autodetect_dev(dev_t dev);
@@ -2172,6 +2177,8 @@ repeat:
if (!mddev->persistent) {
clear_bit(MD_CHANGE_CLEAN, &mddev->flags);
clear_bit(MD_CHANGE_DEVS, &mddev->flags);
+ if (!mddev->external)
+ clear_bit(MD_CHANGE_PENDING, &mddev->flags);
wake_up(&mddev->sb_wait);
return;
}
@@ -2465,6 +2472,10 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (rdev2->raid_disk == slot)
return -EEXIST;
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
+ return -ENOSPC;
+
rdev->raid_disk = slot;
if (test_bit(In_sync, &rdev->flags))
rdev->saved_raid_disk = slot;
@@ -2482,7 +2493,8 @@ slot_store(mdk_rdev_t *rdev, const char *buf, size_t len)
/* failure here is OK */;
/* don't wakeup anyone, leave that to userspace. */
} else {
- if (slot >= rdev->mddev->raid_disks)
+ if (slot >= rdev->mddev->raid_disks &&
+ slot >= rdev->mddev->raid_disks + rdev->mddev->delta_disks)
return -ENOSPC;
rdev->raid_disk = slot;
/* assume it is working */
@@ -2575,7 +2587,7 @@ rdev_size_store(mdk_rdev_t *rdev, const char *buf, size_t len)
if (!sectors)
return -EBUSY;
} else if (!sectors)
- sectors = (rdev->bdev->bd_inode->i_size >> 9) -
+ sectors = (i_size_read(rdev->bdev->bd_inode) >> 9) -
rdev->data_offset;
}
if (sectors < my_mddev->dev_sectors)
@@ -2788,7 +2800,7 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
kobject_init(&rdev->kobj, &rdev_ktype);
- size = rdev->bdev->bd_inode->i_size >> BLOCK_SIZE_BITS;
+ size = i_size_read(rdev->bdev->bd_inode) >> BLOCK_SIZE_BITS;
if (!size) {
printk(KERN_WARNING
"md: %s has zero or unknown size, marking faulty!\n",
@@ -3107,7 +3119,7 @@ level_store(mddev_t *mddev, const char *buf, size_t len)
char nm[20];
if (rdev->raid_disk < 0)
continue;
- if (rdev->new_raid_disk > mddev->raid_disks)
+ if (rdev->new_raid_disk >= mddev->raid_disks)
rdev->new_raid_disk = -1;
if (rdev->new_raid_disk == rdev->raid_disk)
continue;
@@ -3736,6 +3748,8 @@ action_show(mddev_t *mddev, char *page)
return sprintf(page, "%s\n", type);
}
+static void reap_sync_thread(mddev_t *mddev);
+
static ssize_t
action_store(mddev_t *mddev, const char *page, size_t len)
{
@@ -3750,9 +3764,7 @@ action_store(mddev_t *mddev, const char *page, size_t len)
if (cmd_match(page, "idle") || cmd_match(page, "frozen")) {
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- mddev->recovery = 0;
+ reap_sync_thread(mddev);
}
} else if (test_bit(MD_RECOVERY_RUNNING, &mddev->recovery) ||
test_bit(MD_RECOVERY_NEEDED, &mddev->recovery))
@@ -3904,7 +3916,7 @@ static struct md_sysfs_entry md_sync_speed = __ATTR_RO(sync_speed);
static ssize_t
sync_completed_show(mddev_t *mddev, char *page)
{
- unsigned long max_sectors, resync;
+ unsigned long long max_sectors, resync;
if (!test_bit(MD_RECOVERY_RUNNING, &mddev->recovery))
return sprintf(page, "none\n");
@@ -3915,7 +3927,7 @@ sync_completed_show(mddev_t *mddev, char *page)
max_sectors = mddev->dev_sectors;
resync = mddev->curr_resync_completed;
- return sprintf(page, "%lu / %lu\n", resync, max_sectors);
+ return sprintf(page, "%llu / %llu\n", resync, max_sectors);
}
static struct md_sysfs_entry md_sync_completed = __ATTR_RO(sync_completed);
@@ -4002,19 +4014,24 @@ suspend_lo_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_lo;
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if (new >= mddev->suspend_hi ||
- (new > mddev->suspend_lo && new < mddev->suspend_hi)) {
- mddev->suspend_lo = new;
+
+ mddev->suspend_lo = new;
+ if (new >= old)
+ /* Shrinking suspended region */
mddev->pers->quiesce(mddev, 2);
- return len;
- } else
- return -EINVAL;
+ else {
+ /* Expanding suspended region - need to wait */
+ mddev->pers->quiesce(mddev, 1);
+ mddev->pers->quiesce(mddev, 0);
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_lo =
__ATTR(suspend_lo, S_IRUGO|S_IWUSR, suspend_lo_show, suspend_lo_store);
@@ -4031,20 +4048,24 @@ suspend_hi_store(mddev_t *mddev, const char *buf, size_t len)
{
char *e;
unsigned long long new = simple_strtoull(buf, &e, 10);
+ unsigned long long old = mddev->suspend_hi;
if (mddev->pers == NULL ||
mddev->pers->quiesce == NULL)
return -EINVAL;
if (buf == e || (*e && *e != '\n'))
return -EINVAL;
- if ((new <= mddev->suspend_lo && mddev->suspend_lo >= mddev->suspend_hi) ||
- (new > mddev->suspend_lo && new > mddev->suspend_hi)) {
- mddev->suspend_hi = new;
+
+ mddev->suspend_hi = new;
+ if (new <= old)
+ /* Shrinking suspended region */
+ mddev->pers->quiesce(mddev, 2);
+ else {
+ /* Expanding suspended region - need to wait */
mddev->pers->quiesce(mddev, 1);
mddev->pers->quiesce(mddev, 0);
- return len;
- } else
- return -EINVAL;
+ }
+ return len;
}
static struct md_sysfs_entry md_suspend_hi =
__ATTR(suspend_hi, S_IRUGO|S_IWUSR, suspend_hi_show, suspend_hi_store);
@@ -4256,10 +4277,10 @@ static int md_alloc(dev_t dev, char *name)
shift = partitioned ? MdpMinorShift : 0;
unit = MINOR(mddev->unit) >> shift;
- /* wait for any previous instance if this device
- * to be completed removed (mddev_delayed_delete).
+ /* wait for any previous instance of this device to be
+ * completely removed (mddev_delayed_delete).
*/
- flush_scheduled_work();
+ flush_workqueue(md_misc_wq);
mutex_lock(&disks_mutex);
error = -EEXIST;
@@ -4287,9 +4308,6 @@ static int md_alloc(dev_t dev, char *name)
goto abort;
mddev->queue->queuedata = mddev;
- /* Can be unlocked because the queue is new: no concurrency */
- queue_flag_set_unlocked(QUEUE_FLAG_CLUSTER, mddev->queue);
-
blk_queue_make_request(mddev->queue, md_make_request);
disk = alloc_disk(1 << shift);
@@ -4329,6 +4347,8 @@ static int md_alloc(dev_t dev, char *name)
if (mddev->kobj.sd &&
sysfs_create_group(&mddev->kobj, &md_bitmap_group))
printk(KERN_DEBUG "pointless warning\n");
+
+ blk_queue_flush(mddev->queue, REQ_FLUSH | REQ_FUA);
abort:
mutex_unlock(&disks_mutex);
if (!error && mddev->kobj.sd) {
@@ -4423,7 +4443,9 @@ int md_run(mddev_t *mddev)
* We don't want the data to overlap the metadata,
* Internal Bitmap issues have been handled elsewhere.
*/
- if (rdev->data_offset < rdev->sb_start) {
+ if (rdev->meta_bdev) {
+ /* Nothing to check */;
+ } else if (rdev->data_offset < rdev->sb_start) {
if (mddev->dev_sectors &&
rdev->data_offset + mddev->dev_sectors
> rdev->sb_start) {
@@ -4442,6 +4464,9 @@ int md_run(mddev_t *mddev)
sysfs_notify_dirent_safe(rdev->sysfs_state);
}
+ if (mddev->bio_set == NULL)
+ mddev->bio_set = bioset_create(BIO_POOL_SIZE, sizeof(mddev));
+
spin_lock(&pers_lock);
pers = find_pers(mddev->level, mddev->clevel);
if (!pers || !try_module_get(pers->owner)) {
@@ -4504,7 +4529,6 @@ int md_run(mddev_t *mddev)
/* may be over-ridden by personality */
mddev->resync_max_sectors = mddev->dev_sectors;
- mddev->barriers_work = 1;
mddev->ok_start_degraded = start_dirty_degraded;
if (start_readonly && mddev->ro == 0)
@@ -4555,7 +4579,8 @@ int md_run(mddev_t *mddev)
mddev->safemode_timer.data = (unsigned long) mddev;
mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
mddev->in_sync = 1;
-
+ smp_wmb();
+ mddev->ready = 1;
list_for_each_entry(rdev, &mddev->disks, same_set)
if (rdev->raid_disk >= 0) {
char nm[20];
@@ -4683,7 +4708,6 @@ static void md_clean(mddev_t *mddev)
mddev->recovery = 0;
mddev->in_sync = 0;
mddev->degraded = 0;
- mddev->barriers_work = 0;
mddev->safemode = 0;
mddev->bitmap_info.offset = 0;
mddev->bitmap_info.default_offset = 0;
@@ -4693,13 +4717,12 @@ static void md_clean(mddev_t *mddev)
mddev->plug = NULL;
}
-void md_stop_writes(mddev_t *mddev)
+static void __md_stop_writes(mddev_t *mddev)
{
if (mddev->sync_thread) {
set_bit(MD_RECOVERY_FROZEN, &mddev->recovery);
set_bit(MD_RECOVERY_INTR, &mddev->recovery);
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
+ reap_sync_thread(mddev);
}
del_timer_sync(&mddev->safemode_timer);
@@ -4713,10 +4736,18 @@ void md_stop_writes(mddev_t *mddev)
md_update_sb(mddev, 1);
}
}
+
+void md_stop_writes(mddev_t *mddev)
+{
+ mddev_lock(mddev);
+ __md_stop_writes(mddev);
+ mddev_unlock(mddev);
+}
EXPORT_SYMBOL_GPL(md_stop_writes);
void md_stop(mddev_t *mddev)
{
+ mddev->ready = 0;
mddev->pers->stop(mddev);
if (mddev->pers->sync_request && mddev->to_remove == NULL)
mddev->to_remove = &md_redundancy_group;
@@ -4736,7 +4767,7 @@ static int md_set_readonly(mddev_t *mddev, int is_open)
goto out;
}
if (mddev->pers) {
- md_stop_writes(mddev);
+ __md_stop_writes(mddev);
err = -ENXIO;
if (mddev->ro==1)
@@ -4773,7 +4804,7 @@ static int do_md_stop(mddev_t * mddev, int mode, int is_open)
if (mddev->ro)
set_disk_ro(disk, 0);
- md_stop_writes(mddev);
+ __md_stop_writes(mddev);
md_stop(mddev);
mddev->queue->merge_bvec_fn = NULL;
mddev->queue->unplug_fn = NULL;
@@ -5148,17 +5179,21 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
PTR_ERR(rdev));
return PTR_ERR(rdev);
}
- /* set save_raid_disk if appropriate */
+ /* set saved_raid_disk if appropriate */
if (!mddev->persistent) {
if (info->state & (1<<MD_DISK_SYNC) &&
- info->raid_disk < mddev->raid_disks)
+ info->raid_disk < mddev->raid_disks) {
rdev->raid_disk = info->raid_disk;
- else
+ set_bit(In_sync, &rdev->flags);
+ } else
rdev->raid_disk = -1;
} else
super_types[mddev->major_version].
validate_super(mddev, rdev);
- rdev->saved_raid_disk = rdev->raid_disk;
+ if (test_bit(In_sync, &rdev->flags))
+ rdev->saved_raid_disk = rdev->raid_disk;
+ else
+ rdev->saved_raid_disk = -1;
clear_bit(In_sync, &rdev->flags); /* just to be sure */
if (info->state & (1<<MD_DISK_WRITEMOSTLY))
@@ -5225,9 +5260,9 @@ static int add_new_disk(mddev_t * mddev, mdu_disk_info_t *info)
if (!mddev->persistent) {
printk(KERN_INFO "md: nonpersistent superblock ...\n");
- rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
- } else
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
+ } else
+ rdev->sb_start = calc_dev_sboffset(rdev);
rdev->sectors = rdev->sb_start;
err = bind_rdev_to_array(rdev, mddev);
@@ -5294,9 +5329,9 @@ static int hot_add_disk(mddev_t * mddev, dev_t dev)
}
if (mddev->persistent)
- rdev->sb_start = calc_dev_sboffset(rdev->bdev);
+ rdev->sb_start = calc_dev_sboffset(rdev);
else
- rdev->sb_start = rdev->bdev->bd_inode->i_size / 512;
+ rdev->sb_start = i_size_read(rdev->bdev->bd_inode) / 512;
rdev->sectors = rdev->sb_start;
@@ -5507,7 +5542,6 @@ static int update_size(mddev_t *mddev, sector_t num_sectors)
* sb_start or, if that is <data_offset, it must fit before the size
* of each device. If num_sectors is zero, we find the largest size
* that fits.
-
*/
if (mddev->sync_thread)
return -EBUSY;
@@ -5951,16 +5985,14 @@ static int md_open(struct block_device *bdev, fmode_t mode)
mddev_t *mddev = mddev_find(bdev->bd_dev);
int err;
- lock_kernel();
if (mddev->gendisk != bdev->bd_disk) {
/* we are racing with mddev_put which is discarding this
* bd_disk.
*/
mddev_put(mddev);
/* Wait until bdev->bd_disk is definitely gone */
- flush_scheduled_work();
+ flush_workqueue(md_misc_wq);
/* Then retry the open from the top */
- unlock_kernel();
return -ERESTARTSYS;
}
BUG_ON(mddev != bdev->bd_disk->private_data);
@@ -5974,7 +6006,6 @@ static int md_open(struct block_device *bdev, fmode_t mode)
check_disk_size_change(mddev->gendisk, bdev);
out:
- unlock_kernel();
return err;
}
@@ -5983,10 +6014,8 @@ static int md_release(struct gendisk *disk, fmode_t mode)
mddev_t *mddev = disk->private_data;
BUG_ON(!mddev);
- lock_kernel();
atomic_dec(&mddev->openers);
mddev_put(mddev);
- unlock_kernel();
return 0;
}
@@ -6036,8 +6065,8 @@ static int md_thread(void * arg)
thread->timeout);
clear_bit(THREAD_WAKEUP, &thread->flags);
-
- thread->run(thread->mddev);
+ if (!kthread_should_stop())
+ thread->run(thread->mddev);
}
return 0;
@@ -6118,7 +6147,7 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)
set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
md_wakeup_thread(mddev->thread);
if (mddev->event_work.func)
- schedule_work(&mddev->event_work);
+ queue_work(md_misc_wq, &mddev->event_work);
md_new_event_inintr(mddev);
}
@@ -6802,7 +6831,7 @@ void md_do_sync(mddev_t *mddev)
desc, mdname(mddev));
mddev->curr_resync = j;
}
- mddev->curr_resync_completed = mddev->curr_resync;
+ mddev->curr_resync_completed = j;
while (j < max_sectors) {
sector_t sectors;
@@ -6820,8 +6849,7 @@ void md_do_sync(mddev_t *mddev)
md_unplug(mddev);
wait_event(mddev->recovery_wait,
atomic_read(&mddev->recovery_active) == 0);
- mddev->curr_resync_completed =
- mddev->curr_resync;
+ mddev->curr_resync_completed = j;
set_bit(MD_CHANGE_CLEAN, &mddev->flags);
sysfs_notify(&mddev->kobj, NULL, "sync_completed");
}
@@ -7026,6 +7054,45 @@ static int remove_and_add_spares(mddev_t *mddev)
}
return spares;
}
+
+static void reap_sync_thread(mddev_t *mddev)
+{
+ mdk_rdev_t *rdev;
+
+ /* resync has finished, collect result */
+ md_unregister_thread(mddev->sync_thread);
+ mddev->sync_thread = NULL;
+ if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+ !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
+ /* success...*/
+ /* activate any spares */
+ if (mddev->pers->spare_active(mddev))
+ sysfs_notify(&mddev->kobj, NULL,
+ "degraded");
+ }
+ if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+ mddev->pers->finish_reshape)
+ mddev->pers->finish_reshape(mddev);
+ md_update_sb(mddev, 1);
+
+ /* if array is no-longer degraded, then any saved_raid_disk
+ * information must be scrapped
+ */
+ if (!mddev->degraded)
+ list_for_each_entry(rdev, &mddev->disks, same_set)
+ rdev->saved_raid_disk = -1;
+
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
+ /* flag recovery needed just to double check */
+ set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+ sysfs_notify_dirent_safe(mddev->sysfs_action);
+ md_new_event(mddev);
+}
+
/*
* This routine is regularly called by all per-raid-array threads to
* deal with generic issues like resync and super-block update.
@@ -7050,9 +7117,6 @@ static int remove_and_add_spares(mddev_t *mddev)
*/
void md_check_recovery(mddev_t *mddev)
{
- mdk_rdev_t *rdev;
-
-
if (mddev->bitmap)
bitmap_daemon_work(mddev);
@@ -7120,34 +7184,7 @@ void md_check_recovery(mddev_t *mddev)
goto unlock;
}
if (mddev->sync_thread) {
- /* resync has finished, collect result */
- md_unregister_thread(mddev->sync_thread);
- mddev->sync_thread = NULL;
- if (!test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
- !test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) {
- /* success...*/
- /* activate any spares */
- if (mddev->pers->spare_active(mddev))
- sysfs_notify(&mddev->kobj, NULL,
- "degraded");
- }
- if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
- mddev->pers->finish_reshape)
- mddev->pers->finish_reshape(mddev);
- md_update_sb(mddev, 1);
-
- /* if array is no-longer degraded, then any saved_raid_disk
- * information must be scrapped
- */
- if (!mddev->degraded)
- list_for_each_entry(rdev, &mddev->disks, same_set)
- rdev->saved_raid_disk = -1;
-
- mddev->recovery = 0;
- /* flag recovery needed just to double check */
- set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
- sysfs_notify_dirent_safe(mddev->sysfs_action);
- md_new_event(mddev);
+ reap_sync_thread(mddev);
goto unlock;
}
/* Set RUNNING before clearing NEEDED to avoid
@@ -7205,7 +7242,11 @@ void md_check_recovery(mddev_t *mddev)
" thread...\n",
mdname(mddev));
/* leave the spares where they are, it shouldn't hurt */
- mddev->recovery = 0;
+ clear_bit(MD_RECOVERY_RUNNING, &mddev->recovery);
+ clear_bit(MD_RECOVERY_SYNC, &mddev->recovery);
+ clear_bit(MD_RECOVERY_RESHAPE, &mddev->recovery);
+ clear_bit(MD_RECOVERY_REQUESTED, &mddev->recovery);
+ clear_bit(MD_RECOVERY_CHECK, &mddev->recovery);
} else
md_wakeup_thread(mddev->sync_thread);
sysfs_notify_dirent_safe(mddev->sysfs_action);
@@ -7278,12 +7319,23 @@ static void md_geninit(void)
static int __init md_init(void)
{
- if (register_blkdev(MD_MAJOR, "md"))
- return -1;
- if ((mdp_major=register_blkdev(0, "mdp"))<=0) {
- unregister_blkdev(MD_MAJOR, "md");
- return -1;
- }
+ int ret = -ENOMEM;
+
+ md_wq = alloc_workqueue("md", WQ_RESCUER, 0);
+ if (!md_wq)
+ goto err_wq;
+
+ md_misc_wq = alloc_workqueue("md_misc", 0, 0);
+ if (!md_misc_wq)
+ goto err_misc_wq;
+
+ if ((ret = register_blkdev(MD_MAJOR, "md")) < 0)
+ goto err_md;
+
+ if ((ret = register_blkdev(0, "mdp")) < 0)
+ goto err_mdp;
+ mdp_major = ret;
+
blk_register_region(MKDEV(MD_MAJOR, 0), 1UL<<MINORBITS, THIS_MODULE,
md_probe, NULL, NULL);
blk_register_region(MKDEV(mdp_major, 0), 1UL<<MINORBITS, THIS_MODULE,
@@ -7294,8 +7346,16 @@ static int __init md_init(void)
md_geninit();
return 0;
-}
+err_mdp:
+ unregister_blkdev(MD_MAJOR, "md");
+err_md:
+ destroy_workqueue(md_misc_wq);
+err_misc_wq:
+ destroy_workqueue(md_wq);
+err_wq:
+ return ret;
+}
#ifndef MODULE
@@ -7382,6 +7442,8 @@ static __exit void md_exit(void)
export_array(mddev);
mddev->hold_active = 0;
}
+ destroy_workqueue(md_misc_wq);
+ destroy_workqueue(md_wq);
}
subsys_initcall(md_init);