From 532a2a3fba8df076d65fdf17518eeb327b37a313 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:30:52 +1100 Subject: md: raid 10 supports TRIM This makes md raid 10 support TRIM. If one disk supports discard and another not, or one has discard_zero_data and another not, there could be inconsistent between data from such disks. But this should not matter, discarded data is useless. This will add extra copy in rebuild though. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/raid10.c | 29 +++++++++++++++++++++++++---- 1 file changed, 25 insertions(+), 4 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 1c2eb38f3c51..f92e0ed59be0 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -911,7 +911,12 @@ static void flush_pending_writes(struct r10conf *conf) while (bio) { /* submit pending writes */ struct bio *next = bio->bi_next; bio->bi_next = NULL; - generic_make_request(bio); + if (unlikely((bio->bi_rw & REQ_DISCARD) && + !blk_queue_discard(bdev_get_queue(bio->bi_bdev)))) + /* Just ignore it */ + bio_endio(bio, 0); + else + generic_make_request(bio); bio = next; } } else @@ -1061,6 +1066,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) const int rw = bio_data_dir(bio); const unsigned long do_sync = (bio->bi_rw & REQ_SYNC); const unsigned long do_fua = (bio->bi_rw & REQ_FUA); + const unsigned long do_discard = (bio->bi_rw + & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; int sectors_handled; @@ -1081,7 +1088,7 @@ static void make_request(struct mddev *mddev, struct bio * bio) || conf->prev.near_copies < conf->prev.raid_disks))) { struct bio_pair *bp; /* Sanity check -- queue functions should prevent this happening */ - if (bio->bi_vcnt != 1 || + if ((bio->bi_vcnt != 1 && bio->bi_vcnt != 0) || bio->bi_idx != 0) goto bad_map; /* This is a one page bio that upper layers @@ -1410,7 +1417,7 @@ retry_write: conf->mirrors[d].rdev)); mbio->bi_bdev = conf->mirrors[d].rdev->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1439,7 +1446,7 @@ retry_write: conf->mirrors[d].replacement)); mbio->bi_bdev = conf->mirrors[d].replacement->bdev; mbio->bi_end_io = raid10_end_write_request; - mbio->bi_rw = WRITE | do_sync | do_fua; + mbio->bi_rw = WRITE | do_sync | do_fua | do_discard; mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); @@ -1723,6 +1730,9 @@ static int raid10_add_disk(struct mddev *mddev, struct md_rdev *rdev) clear_bit(Unmerged, &rdev->flags); } md_integrity_add_rdev(rdev, mddev); + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + print_conf(conf); return err; } @@ -3480,6 +3490,7 @@ static int run(struct mddev *mddev) sector_t size; sector_t min_offset_diff = 0; int first = 1; + bool discard_supported = false; if (mddev->private == NULL) { conf = setup_conf(mddev); @@ -3496,6 +3507,8 @@ static int run(struct mddev *mddev) chunk_size = mddev->chunk_sectors << 9; if (mddev->queue) { + blk_queue_max_discard_sectors(mddev->queue, + mddev->chunk_sectors); blk_queue_io_min(mddev->queue, chunk_size); if (conf->geo.raid_disks % conf->geo.near_copies) blk_queue_io_opt(mddev->queue, chunk_size * conf->geo.raid_disks); @@ -3541,8 +3554,16 @@ static int run(struct mddev *mddev) rdev->data_offset << 9); disk->head_position = 0; + + if (blk_queue_discard(bdev_get_queue(rdev->bdev))) + discard_supported = true; } + if (discard_supported) + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + else + queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue); + /* need to check that every block has at least one working mirror */ if (!enough(conf, -1)) { printk(KERN_ERR "md/raid10:%s: not enough operational mirrors.\n", -- cgit 1.4.1 From 57c67df48866d57b50d72eb198ffcc0cf7a6232d Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 11 Oct 2012 13:32:13 +1100 Subject: md/raid10: submit IO from originating thread instead of md thread. queuing writes to the md thread means that all requests go through the one processor which may not be able to keep up with very high request rates. So use the plugging infrastructure to submit all requests on unplug. If a 'schedule' is needed, we fall back on the old approach of handing the requests to the thread for it to handle. This is nearly identical to a recent patch which provided similar functionality to RAID1. Signed-off-by: NeilBrown --- drivers/md/raid10.c | 57 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 54 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index f92e0ed59be0..05dc96a950d5 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1055,6 +1055,44 @@ static sector_t choose_data_offset(struct r10bio *r10_bio, return rdev->new_data_offset; } +struct raid10_plug_cb { + struct blk_plug_cb cb; + struct bio_list pending; + int pending_cnt; +}; + +static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule) +{ + struct raid10_plug_cb *plug = container_of(cb, struct raid10_plug_cb, + cb); + struct mddev *mddev = plug->cb.data; + struct r10conf *conf = mddev->private; + struct bio *bio; + + if (from_schedule) { + spin_lock_irq(&conf->device_lock); + bio_list_merge(&conf->pending_bio_list, &plug->pending); + conf->pending_count += plug->pending_cnt; + spin_unlock_irq(&conf->device_lock); + md_wakeup_thread(mddev->thread); + kfree(plug); + return; + } + + /* we aren't scheduling, so we can do the write-out directly. */ + bio = bio_list_get(&plug->pending); + bitmap_unplug(mddev->bitmap); + wake_up(&conf->wait_barrier); + + while (bio) { /* submit pending writes */ + struct bio *next = bio->bi_next; + bio->bi_next = NULL; + generic_make_request(bio); + bio = next; + } + kfree(plug); +} + static void make_request(struct mddev *mddev, struct bio * bio) { struct r10conf *conf = mddev->private; @@ -1070,6 +1108,8 @@ static void make_request(struct mddev *mddev, struct bio * bio) & (REQ_DISCARD | REQ_SECURE)); unsigned long flags; struct md_rdev *blocked_rdev; + struct blk_plug_cb *cb; + struct raid10_plug_cb *plug = NULL; int sectors_handled; int max_sectors; int sectors; @@ -1421,11 +1461,22 @@ retry_write: mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); if (!r10_bio->devs[i].repl_bio) -- cgit 1.4.1 From 4ed8731d8e6bd2a88a30697fbf4f7e6e979a6c46 Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 11 Oct 2012 13:34:00 +1100 Subject: MD: change the parameter of md thread Change the thread parameter, so the thread can carry extra info. Next patch will use it. Signed-off-by: Shaohua Li Signed-off-by: NeilBrown --- drivers/md/md.c | 9 +++++---- drivers/md/md.h | 7 ++++--- drivers/md/multipath.c | 3 ++- drivers/md/raid1.c | 3 ++- drivers/md/raid10.c | 3 ++- drivers/md/raid5.c | 3 ++- 6 files changed, 17 insertions(+), 11 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 7a2b0793f66e..8e842b326ebd 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -6641,7 +6641,7 @@ static int md_thread(void * arg) clear_bit(THREAD_WAKEUP, &thread->flags); if (!kthread_should_stop()) - thread->run(thread->mddev); + thread->run(thread); } return 0; @@ -6656,8 +6656,8 @@ void md_wakeup_thread(struct md_thread *thread) } } -struct md_thread *md_register_thread(void (*run) (struct mddev *), struct mddev *mddev, - const char *name) +struct md_thread *md_register_thread(void (*run) (struct md_thread *), + struct mddev *mddev, const char *name) { struct md_thread *thread; @@ -7206,8 +7206,9 @@ EXPORT_SYMBOL_GPL(md_allow_write); #define SYNC_MARKS 10 #define SYNC_MARK_STEP (3*HZ) -void md_do_sync(struct mddev *mddev) +void md_do_sync(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct mddev *mddev2; unsigned int currspeed = 0, window; diff --git a/drivers/md/md.h b/drivers/md/md.h index f385b038589d..93ada214b50e 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -540,12 +540,13 @@ static inline void sysfs_unlink_rdev(struct mddev *mddev, struct md_rdev *rdev) list_for_each_entry_rcu(rdev, &((mddev)->disks), same_set) struct md_thread { - void (*run) (struct mddev *mddev); + void (*run) (struct md_thread *thread); struct mddev *mddev; wait_queue_head_t wqueue; unsigned long flags; struct task_struct *tsk; unsigned long timeout; + void *private; }; #define THREAD_WAKEUP 0 @@ -584,7 +585,7 @@ static inline void safe_put_page(struct page *p) extern int register_md_personality(struct md_personality *p); extern int unregister_md_personality(struct md_personality *p); extern struct md_thread *md_register_thread( - void (*run)(struct mddev *mddev), + void (*run)(struct md_thread *thread), struct mddev *mddev, const char *name); extern void md_unregister_thread(struct md_thread **threadp); @@ -603,7 +604,7 @@ extern void md_super_write(struct mddev *mddev, struct md_rdev *rdev, extern void md_super_wait(struct mddev *mddev); extern int sync_page_io(struct md_rdev *rdev, sector_t sector, int size, struct page *page, int rw, bool metadata_op); -extern void md_do_sync(struct mddev *mddev); +extern void md_do_sync(struct md_thread *thread); extern void md_new_event(struct mddev *mddev); extern int md_allow_write(struct mddev *mddev); extern void md_wait_for_blocked_rdev(struct md_rdev *rdev, struct mddev *mddev); diff --git a/drivers/md/multipath.c b/drivers/md/multipath.c index 61a1833ebaf3..1642eae75a33 100644 --- a/drivers/md/multipath.c +++ b/drivers/md/multipath.c @@ -335,8 +335,9 @@ abort: * 3. Performs writes following reads for array syncronising. */ -static void multipathd (struct mddev *mddev) +static void multipathd(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct multipath_bh *mp_bh; struct bio *bio; unsigned long flags; diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 05c557e8f862..55ccf4730536 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -2294,8 +2294,9 @@ read_more: } } -static void raid1d(struct mddev *mddev) +static void raid1d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r1bio *r1_bio; unsigned long flags; struct r1conf *conf = mddev->private; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 05dc96a950d5..54860604d097 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2732,8 +2732,9 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio) } } -static void raid10d(struct mddev *mddev) +static void raid10d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r10bio *r10_bio; unsigned long flags; struct r10conf *conf = mddev->private; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index adda94df5eb2..81c02d63440b 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4625,8 +4625,9 @@ static int handle_active_stripes(struct r5conf *conf) * During the scan, completed stripes are saved for us by the interrupt * handler, so that they will not have to wait for our next wakeup. */ -static void raid5d(struct mddev *mddev) +static void raid5d(struct md_thread *thread) { + struct mddev *mddev = thread->mddev; struct r5conf *conf = mddev->private; int handled; struct blk_plug plug; -- cgit 1.4.1 From 2863b9eb44787adecba4f977d71d7fd876805b1c Mon Sep 17 00:00:00 2001 From: Jonathan Brassow Date: Thu, 11 Oct 2012 13:38:58 +1100 Subject: MD RAID10: Prep for DM RAID10 device replacement capability MD RAID10: Fix a couple potential kernel panics if RAID10 is used by dm-raid When device-mapper uses the RAID10 personality through dm-raid.c, there is no 'gendisk' structure in mddev and some sysfs information is also not populated. This patch avoids touching those non-existent structures. Signed-off-by: Jonathan Brassow Signed-off-by: NeilBrown --- drivers/md/md.c | 10 ++++++++-- drivers/md/raid10.c | 2 +- 2 files changed, 9 insertions(+), 3 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index feab588adb50..85e6786653ea 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -2044,8 +2044,14 @@ EXPORT_SYMBOL(md_integrity_register); /* Disable data integrity if non-capable/non-matching disk is being added */ void md_integrity_add_rdev(struct md_rdev *rdev, struct mddev *mddev) { - struct blk_integrity *bi_rdev = bdev_get_integrity(rdev->bdev); - struct blk_integrity *bi_mddev = blk_get_integrity(mddev->gendisk); + struct blk_integrity *bi_rdev; + struct blk_integrity *bi_mddev; + + if (!mddev->gendisk) + return; + + bi_rdev = bdev_get_integrity(rdev->bdev); + bi_mddev = blk_get_integrity(mddev->gendisk); if (!bi_mddev) /* nothing to do */ return; diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 54860604d097..fb5bd607e15c 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1694,7 +1694,7 @@ static int raid10_spare_active(struct mddev *mddev) && !test_bit(Faulty, &tmp->rdev->flags) && !test_and_set_bit(In_sync, &tmp->rdev->flags)) { count++; - sysfs_notify_dirent(tmp->rdev->sysfs_state); + sysfs_notify_dirent_safe(tmp->rdev->sysfs_state); } } spin_lock_irqsave(&conf->device_lock, flags); -- cgit 1.4.1 From 7f7583d420231b9d09897afd57a957011b606a5b Mon Sep 17 00:00:00 2001 From: Jianpeng Ma Date: Thu, 11 Oct 2012 14:17:59 +1100 Subject: Subject: [PATCH] md:change resync_mismatches to atomic64_t to avoid races Now that multiple threads can handle stripes, it is safer to use an atomic64_t for resync_mismatches, to avoid update races. Signed-off-by: Jianpeng Ma Signed-off-by: NeilBrown --- drivers/md/md.c | 7 ++++--- drivers/md/md.h | 2 +- drivers/md/raid1.c | 2 +- drivers/md/raid10.c | 2 +- drivers/md/raid5.c | 4 ++-- 5 files changed, 9 insertions(+), 8 deletions(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/md.c b/drivers/md/md.c index 85e6786653ea..7564c44b8045 100644 --- a/drivers/md/md.c +++ b/drivers/md/md.c @@ -4269,7 +4269,8 @@ static ssize_t mismatch_cnt_show(struct mddev *mddev, char *page) { return sprintf(page, "%llu\n", - (unsigned long long) mddev->resync_mismatches); + (unsigned long long) + atomic64_read(&mddev->resync_mismatches)); } static struct md_sysfs_entry md_scan_mode = @@ -5235,7 +5236,7 @@ static void md_clean(struct mddev *mddev) mddev->new_layout = 0; mddev->new_chunk_sectors = 0; mddev->curr_resync = 0; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); mddev->suspend_lo = mddev->suspend_hi = 0; mddev->sync_speed_min = mddev->sync_speed_max = 0; mddev->recovery = 0; @@ -7357,7 +7358,7 @@ void md_do_sync(struct md_thread *thread) * which defaults to physical size, but can be virtual size */ max_sectors = mddev->resync_max_sectors; - mddev->resync_mismatches = 0; + atomic64_set(&mddev->resync_mismatches, 0); /* we don't use the checkpoint if there's a bitmap */ if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery)) j = mddev->resync_min; diff --git a/drivers/md/md.h b/drivers/md/md.h index 93ada214b50e..af443ab868db 100644 --- a/drivers/md/md.h +++ b/drivers/md/md.h @@ -282,7 +282,7 @@ struct mddev { sector_t resync_max_sectors; /* may be set by personality */ - sector_t resync_mismatches; /* count of sectors where + atomic64_t resync_mismatches; /* count of sectors where * parity/replica mismatch found */ diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index e913356b257e..8034fbd6190c 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1876,7 +1876,7 @@ static int process_checks(struct r1bio *r1_bio) } else j = 0; if (j >= 0) - mddev->resync_mismatches += r1_bio->sectors; + atomic64_add(r1_bio->sectors, &mddev->resync_mismatches); if (j < 0 || (test_bit(MD_RECOVERY_CHECK, &mddev->recovery) && test_bit(BIO_UPTODATE, &sbio->bi_flags))) { /* No need to write to this device. */ diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index fb5bd607e15c..146749b277c6 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -2011,7 +2011,7 @@ static void sync_request_write(struct mddev *mddev, struct r10bio *r10_bio) break; if (j == vcnt) continue; - mddev->resync_mismatches += r10_bio->sectors; + atomic64_add(r10_bio->sectors, &mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &mddev->recovery)) /* Don't fix anything. */ continue; diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index ab613efbbead..4cc6b0e398bc 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2973,7 +2973,7 @@ static void handle_parity_checks5(struct r5conf *conf, struct stripe_head *sh, */ set_bit(STRIPE_INSYNC, &sh->state); else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); @@ -3125,7 +3125,7 @@ static void handle_parity_checks6(struct r5conf *conf, struct stripe_head *sh, */ } } else { - conf->mddev->resync_mismatches += STRIPE_SECTORS; + atomic64_add(STRIPE_SECTORS, &conf->mddev->resync_mismatches); if (test_bit(MD_RECOVERY_CHECK, &conf->mddev->recovery)) /* don't try to repair!! */ set_bit(STRIPE_INSYNC, &sh->state); -- cgit 1.4.1 From 91502f099dfc5a1e8812898e26ee280713e1d002 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 11 Oct 2012 14:20:58 +1100 Subject: md/raid10: use correct limit variable Clang complains that we are assigning a variable to itself. This should be using bad_sectors like the similar earlier check does. Bug has been present since 3.1-rc1. It is minor but could conceivably cause corruption or other bad behaviour. Cc: stable@vger.kernel.org Signed-off-by: Dan Carpenter Signed-off-by: NeilBrown --- drivers/md/raid10.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'drivers/md/raid10.c') diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index 146749b277c6..867d1b4e9634 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -3218,7 +3218,7 @@ static sector_t sync_request(struct mddev *mddev, sector_t sector_nr, else { bad_sectors -= (sector - first_bad); if (max_sync > bad_sectors) - max_sync = max_sync; + max_sync = bad_sectors; continue; } } -- cgit 1.4.1