From fe0b393f2c0a0d23a9bc9ed7dc51a1ee511098bd Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:47 -0500 Subject: block: Correct handling of bottom device misaligment The top device misalignment flag would not be set if the added bottom device was already misaligned as opposed to causing a stacking failure. Also massage the reporting so that an error is only returned if adding the bottom device caused the misalignment. I.e. don't return an error if the top is already flagged as misaligned. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/blk-settings.c | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index d52d4adc440b..127f82551855 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -528,7 +528,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset) { sector_t alignment; - unsigned int top, bottom; + unsigned int top, bottom, ret = 0; t->max_sectors = min_not_zero(t->max_sectors, b->max_sectors); t->max_hw_sectors = min_not_zero(t->max_hw_sectors, b->max_hw_sectors); @@ -546,6 +546,8 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, t->max_segment_size = min_not_zero(t->max_segment_size, b->max_segment_size); + t->misaligned |= b->misaligned; + alignment = queue_limit_alignment_offset(b, offset); /* Bottom device has different alignment. 
Check that it is @@ -558,8 +560,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, bottom = max(b->physical_block_size, b->io_min) + alignment; /* Verify that top and bottom intervals line up */ - if (max(top, bottom) & (min(top, bottom) - 1)) + if (max(top, bottom) & (min(top, bottom) - 1)) { t->misaligned = 1; + ret = -1; + } } t->logical_block_size = max(t->logical_block_size, @@ -578,18 +582,21 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, if (t->physical_block_size & (t->logical_block_size - 1)) { t->physical_block_size = t->logical_block_size; t->misaligned = 1; + ret = -1; } /* Minimum I/O a multiple of the physical block size? */ if (t->io_min & (t->physical_block_size - 1)) { t->io_min = t->physical_block_size; t->misaligned = 1; + ret = -1; } /* Optimal I/O a multiple of the physical block size? */ if (t->io_opt & (t->physical_block_size - 1)) { t->io_opt = 0; t->misaligned = 1; + ret = -1; } /* Find lowest common alignment_offset */ @@ -597,8 +604,10 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, & (max(t->physical_block_size, t->io_min) - 1); /* Verify that new alignment_offset is on a logical block boundary */ - if (t->alignment_offset & (t->logical_block_size - 1)) + if (t->alignment_offset & (t->logical_block_size - 1)) { t->misaligned = 1; + ret = -1; + } /* Discard alignment and granularity */ if (b->discard_granularity) { @@ -626,7 +635,7 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, (t->discard_granularity - 1); } - return t->misaligned ? -1 : 0; + return ret; } EXPORT_SYMBOL(blk_stack_limits); -- cgit 1.4.1 From dd3d145d49c5816b79acc6761ebbd842bc50b0ee Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:48 -0500 Subject: block: Fix discard alignment calculation and printing Discard alignment reporting for partitions was incorrect. Update to match the algorithm used elsewhere. 
The alignment can be negative (misaligned). Fix format string accordingly. Signed-off-by: Martin K. Petersen Signed-off-by: Jens Axboe --- block/genhd.c | 2 +- include/linux/blkdev.h | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) (limited to 'block') diff --git a/block/genhd.c b/block/genhd.c index b11a4ad7d571..d13ba76a169c 100644 --- a/block/genhd.c +++ b/block/genhd.c @@ -867,7 +867,7 @@ static ssize_t disk_discard_alignment_show(struct device *dev, { struct gendisk *disk = dev_to_disk(dev); - return sprintf(buf, "%u\n", queue_discard_alignment(disk->queue)); + return sprintf(buf, "%d\n", queue_discard_alignment(disk->queue)); } static DEVICE_ATTR(range, S_IRUGO, disk_range_show, NULL); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 9b98173a8184..a41bcc8e140f 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -1148,8 +1148,11 @@ static inline int queue_discard_alignment(struct request_queue *q) static inline int queue_sector_discard_alignment(struct request_queue *q, sector_t sector) { - return ((sector << 9) - q->limits.discard_alignment) - & (q->limits.discard_granularity - 1); + struct queue_limits *lim = &q->limits; + unsigned int alignment = (sector << 9) & (lim->discard_granularity - 1); + + return (lim->discard_granularity + lim->discard_alignment - alignment) + & (lim->discard_granularity - 1); } static inline unsigned int queue_discard_zeroes_data(struct request_queue *q) -- cgit 1.4.1 From 17be8c245054b9c7786545af3ba3ca4e54cd4ad9 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Mon, 11 Jan 2010 03:21:49 -0500 Subject: block: bdev_stack_limits wrapper DM does not want to know about partition offsets. Add a partition-aware wrapper that DM can use when stacking block devices. Signed-off-by: Martin K. 
Petersen Acked-by: Mike Snitzer Reviewed-by: Alasdair G Kergon Signed-off-by: Jens Axboe --- block/blk-settings.c | 22 ++++++++++++++++++++++ include/linux/blkdev.h | 2 ++ 2 files changed, 24 insertions(+) (limited to 'block') diff --git a/block/blk-settings.c b/block/blk-settings.c index 127f82551855..5eeb9e0d256e 100644 --- a/block/blk-settings.c +++ b/block/blk-settings.c @@ -639,6 +639,28 @@ int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, } EXPORT_SYMBOL(blk_stack_limits); +/** + * bdev_stack_limits - adjust queue limits for stacked drivers + * @t: the stacking driver limits (top device) + * @bdev: the component block_device (bottom) + * @start: first data sector within component device + * + * Description: + * Merges queue limits for a top device and a block_device. Returns + * 0 if alignment didn't change. Returns -1 if adding the bottom + * device caused misalignment. + */ +int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t start) +{ + struct request_queue *bq = bdev_get_queue(bdev); + + start += get_start_sect(bdev); + + return blk_stack_limits(t, &bq->limits, start << 9); +} +EXPORT_SYMBOL(bdev_stack_limits); + /** * disk_stack_limits - adjust queue limits for stacked drivers * @disk: MD/DM gendisk (top) diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index a41bcc8e140f..5c8018977efa 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -938,6 +938,8 @@ extern void blk_queue_io_opt(struct request_queue *q, unsigned int opt); extern void blk_set_default_limits(struct queue_limits *lim); extern int blk_stack_limits(struct queue_limits *t, struct queue_limits *b, sector_t offset); +extern int bdev_stack_limits(struct queue_limits *t, struct block_device *bdev, + sector_t offset); extern void disk_stack_limits(struct gendisk *disk, struct block_device *bdev, sector_t offset); extern void blk_queue_stack_limits(struct request_queue *t, struct request_queue *b); -- cgit 1.4.1 From 
ce289321b7dc1eb108e3df0dec872b7429ef49f7 Mon Sep 17 00:00:00 2001 From: Kirill Afonshin Date: Fri, 8 Jan 2010 22:09:59 +0300 Subject: block: removed unused as_io_context It isn't used anymore, since AS was deleted. Signed-off-by: Jens Axboe --- block/blk-ioc.c | 5 ----- include/linux/iocontext.h | 27 --------------------------- 2 files changed, 32 deletions(-) (limited to 'block') diff --git a/block/blk-ioc.c b/block/blk-ioc.c index cbdabb0dd6d7..98e6bf61b0ac 100644 --- a/block/blk-ioc.c +++ b/block/blk-ioc.c @@ -39,8 +39,6 @@ int put_io_context(struct io_context *ioc) if (atomic_long_dec_and_test(&ioc->refcount)) { rcu_read_lock(); - if (ioc->aic && ioc->aic->dtor) - ioc->aic->dtor(ioc->aic); cfq_dtor(ioc); rcu_read_unlock(); @@ -76,8 +74,6 @@ void exit_io_context(struct task_struct *task) task_unlock(task); if (atomic_dec_and_test(&ioc->nr_tasks)) { - if (ioc->aic && ioc->aic->exit) - ioc->aic->exit(ioc->aic); cfq_exit(ioc); } @@ -97,7 +93,6 @@ struct io_context *alloc_io_context(gfp_t gfp_flags, int node) ret->ioprio = 0; ret->last_waited = jiffies; /* doesn't matter... */ ret->nr_batch_requests = 0; /* because this is 0 */ - ret->aic = NULL; INIT_RADIX_TREE(&ret->radix_root, GFP_ATOMIC | __GFP_HIGH); INIT_HLIST_HEAD(&ret->cic_list); ret->ioc_data = NULL; diff --git a/include/linux/iocontext.h b/include/linux/iocontext.h index a63235996309..78ef023227d4 100644 --- a/include/linux/iocontext.h +++ b/include/linux/iocontext.h @@ -4,32 +4,6 @@ #include #include -/* - * This is the per-process anticipatory I/O scheduler state. 
- */ -struct as_io_context { - spinlock_t lock; - - void (*dtor)(struct as_io_context *aic); /* destructor */ - void (*exit)(struct as_io_context *aic); /* called on task exit */ - - unsigned long state; - atomic_t nr_queued; /* queued reads & sync writes */ - atomic_t nr_dispatched; /* number of requests gone to the drivers */ - - /* IO History tracking */ - /* Thinktime */ - unsigned long last_end_request; - unsigned long ttime_total; - unsigned long ttime_samples; - unsigned long ttime_mean; - /* Layout pattern */ - unsigned int seek_samples; - sector_t last_request_pos; - u64 seek_total; - sector_t seek_mean; -}; - struct cfq_queue; struct cfq_io_context { void *key; @@ -78,7 +52,6 @@ struct io_context { unsigned long last_waited; /* Time last woken after wait for request */ int nr_batch_requests; /* Number of requests left in the batch */ - struct as_io_context *aic; struct radix_tree_root radix_root; struct hlist_head cic_list; void *ioc_data; -- cgit 1.4.1 From 875feb63b9567442be73efbcc9a8470e376d6423 Mon Sep 17 00:00:00 2001 From: Divyesh Shah Date: Wed, 6 Jan 2010 18:58:20 -0800 Subject: cfq-iosched: Respect ioprio_class when preempting In cfq_should_preempt(), we currently allow some cases where a non-RT request can preempt an ongoing RT cfqq timeslice. This should not happen. Examples include: o A sync_noidle wl type non-RT request pre-empting a sync_noidle wl type cfqq on which we are idling. o Once we have per-cgroup async queues, a non-RT sync request pre-empting a RT async cfqq. 
Signed-off-by: Divyesh Shah Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 918c7fd9aeb1..ee130f14d1fc 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -3076,6 +3076,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq, if (cfq_class_idle(cfqq)) return true; + /* + * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice. + */ + if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq)) + return false; + /* * if the new request is sync, but the currently running queue is * not, let the sync request have priority. -- cgit 1.4.1 From bcf4dd43424cdfd8195f3955300a579fe58e9911 Mon Sep 17 00:00:00 2001 From: Gui Jianfeng Date: Mon, 1 Feb 2010 09:58:54 +0100 Subject: blk-cgroup: Fix potential deadlock in blk-cgroup I triggered a lockdep warning as following. ======================================================= [ INFO: possible circular locking dependency detected ] 2.6.33-rc2 #1 ------------------------------------------------------- test_io_control/7357 is trying to acquire lock: (blkio_list_lock){+.+...}, at: [] blkiocg_weight_write+0x82/0x9e but task is already holding lock: (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e which lock already depends on the new lock. 
the existing dependency chain (in reverse order) is: -> #2 (&(&blkcg->lock)->rlock){......}: [] validate_chain+0x8bc/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock_irqsave+0x27/0x5a [] blkiocg_add_blkio_group+0x1a/0x6d [] cfq_get_queue+0x225/0x3de [] cfq_set_request+0x217/0x42d [] elv_set_request+0x17/0x26 [] get_request+0x203/0x2c5 [] get_request_wait+0x18/0x10e [] __make_request+0x2ba/0x375 [] generic_make_request+0x28d/0x30f [] submit_bio+0x8a/0x8f [] submit_bh+0xf0/0x10f [] ll_rw_block+0xc0/0xf9 [] ext3_find_entry+0x319/0x544 [ext3] [] ext3_lookup+0x2c/0xb9 [ext3] [] do_lookup+0xd3/0x172 [] link_path_walk+0x5fb/0x95c [] path_walk+0x3c/0x81 [] do_path_lookup+0x21/0x8a [] do_filp_open+0xf0/0x978 [] open_exec+0x1b/0xb7 [] do_execve+0xbb/0x266 [] sys_execve+0x24/0x4a [] ptregs_execve+0x12/0x18 -> #1 (&(&q->__queue_lock)->rlock){..-.-.}: [] validate_chain+0x8bc/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock_irqsave+0x27/0x5a [] cfq_unlink_blkio_group+0x17/0x41 [] blkiocg_destroy+0x72/0xc7 [] cgroup_diput+0x4a/0xb2 [] dentry_iput+0x93/0xb7 [] d_kill+0x1c/0x36 [] dput+0xf5/0xfe [] do_rmdir+0x95/0xbe [] sys_rmdir+0x10/0x12 [] sysenter_do_call+0x12/0x32 -> #0 (blkio_list_lock){+.+...}: [] validate_chain+0x61c/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] _raw_spin_lock+0x1e/0x4e [] blkiocg_weight_write+0x82/0x9e [] cgroup_file_write+0xc6/0x1c0 [] vfs_write+0x8c/0x116 [] sys_write+0x3b/0x60 [] sysenter_do_call+0x12/0x32 other info that might help us debug this: 1 lock held by test_io_control/7357: #0: (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e stack backtrace: Pid: 7357, comm: test_io_control Not tainted 2.6.33-rc2 #1 Call Trace: [] print_circular_bug+0x91/0x9d [] validate_chain+0x61c/0xb9c [] __lock_acquire+0x723/0x789 [] lock_acquire+0x90/0xa7 [] ? blkiocg_weight_write+0x82/0x9e [] _raw_spin_lock+0x1e/0x4e [] ? 
blkiocg_weight_write+0x82/0x9e [] blkiocg_weight_write+0x82/0x9e [] cgroup_file_write+0xc6/0x1c0 [] ? trace_hardirqs_off+0xb/0xd [] ? cpu_clock+0x2e/0x44 [] ? security_file_permission+0xf/0x11 [] ? rw_verify_area+0x8a/0xad [] ? cgroup_file_write+0x0/0x1c0 [] vfs_write+0x8c/0x116 [] sys_write+0x3b/0x60 [] sysenter_do_call+0x12/0x32 To prevent deadlock, we should take locks as following sequence: blkio_list_lock -> queue_lock -> blkcg_lock. The following patch should fix this bug. Signed-off-by: Gui Jianfeng Signed-off-by: Jens Axboe --- block/blk-cgroup.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) (limited to 'block') diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c index 1fa2654db0a6..e7dbbaf5fb3e 100644 --- a/block/blk-cgroup.c +++ b/block/blk-cgroup.c @@ -147,16 +147,16 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val) return -EINVAL; blkcg = cgroup_to_blkio_cgroup(cgroup); + spin_lock(&blkio_list_lock); spin_lock_irq(&blkcg->lock); blkcg->weight = (unsigned int)val; hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) { - spin_lock(&blkio_list_lock); list_for_each_entry(blkiop, &blkio_list, list) blkiop->ops.blkio_update_group_weight_fn(blkg, blkcg->weight); - spin_unlock(&blkio_list_lock); } spin_unlock_irq(&blkcg->lock); + spin_unlock(&blkio_list_lock); return 0; } -- cgit 1.4.1 From 1efe8fe1c2240acc476bed77740883df63373862 Mon Sep 17 00:00:00 2001 From: Vivek Goyal Date: Tue, 2 Feb 2010 20:45:46 +0100 Subject: cfq-iosched: Do not idle on async queues Few weeks back, Shaohua Li had posted similar patch. I am reposting it with more test results. This patch does two things. - Do not idle on async queues. - It also changes the write queue depth CFQ drives (cfq_may_dispatch()). Currently, we seem to driving queue depth of 1 always for WRITES. 
This is true even if there is only one write queue in the system, and all the logic of infinite queue depth in the case of a single busy queue, as well as slowly increasing queue depth based on the last delayed sync request, does not seem to be kicking in at all. This patch will allow deeper WRITE queue depths (subject to the other WRITE queue depth constraints like cfq_quantum and last delayed sync request). Shaohua Li had reported getting more out of his SSD. For me, I have got one Lun exported from an HP EVA and when pure buffered writes are on, I can get more out of the system. Following are test results of pure buffered writes (with end_fsync=1) with vanilla and patched kernel. These results are averages of 3 sets of runs with increasing number of threads. AVERAGE[bufwfs][vanilla] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- bufwfs 3 1 0 0 95349 474141 bufwfs 3 2 0 0 100282 806926 bufwfs 3 4 0 0 109989 2.7301e+06 bufwfs 3 8 0 0 116642 3762231 bufwfs 3 16 0 0 118230 6902970 AVERAGE[bufwfs] [patched kernel] ------- bufwfs 3 1 0 0 270722 404352 bufwfs 3 2 0 0 206770 1.06552e+06 bufwfs 3 4 0 0 195277 1.62283e+06 bufwfs 3 8 0 0 260960 2.62979e+06 bufwfs 3 16 0 0 299260 1.70731e+06 I also ran buffered writes along with some sequential reads and some buffered reads going on in the system on a SATA disk, because the potential risk is that driving the queue depth higher in the presence of ongoing sync IO could increase the max clat, which we want to keep low. With some random and sequential reads going on in the system on one SATA disk I did not see any significant increase in max clat. So it looks like the other WRITE queue depth control logic is doing its job. Here are the results. 
AVERAGE[brr, bsr, bufw together] [vanilla] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- brr 3 1 850 546345 0 0 bsr 3 1 14650 729543 0 0 bufw 3 1 0 0 23908 8274517 brr 3 2 981.333 579395 0 0 bsr 3 2 14149.7 1175689 0 0 bufw 3 2 0 0 21921 1.28108e+07 brr 3 4 898.333 1.75527e+06 0 0 bsr 3 4 12230.7 1.40072e+06 0 0 bufw 3 4 0 0 19722.3 2.4901e+07 brr 3 8 900 3160594 0 0 bsr 3 8 9282.33 1.91314e+06 0 0 bufw 3 8 0 0 18789.3 23890622 AVERAGE[brr, bsr, bufw mixed] [patched kernel] ------- job Set NR ReadBW(KB/s) MaxClat(us) WriteBW(KB/s) MaxClat(us) --- --- -- ------------ ----------- ------------- ----------- brr 3 1 837 417973 0 0 bsr 3 1 14357.7 591275 0 0 bufw 3 1 0 0 24869.7 8910662 brr 3 2 1038.33 543434 0 0 bsr 3 2 13351.3 1205858 0 0 bufw 3 2 0 0 18626.3 13280370 brr 3 4 913 1.86861e+06 0 0 bsr 3 4 12652.3 1430974 0 0 bufw 3 4 0 0 15343.3 2.81305e+07 brr 3 8 890 2.92695e+06 0 0 bsr 3 8 9635.33 1.90244e+06 0 0 bufw 3 8 0 0 17200.3 24424392 So looks like it might make sense to include this patch. Thanks Vivek Signed-off-by: Vivek Goyal Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index ee130f14d1fc..17b768d0d42f 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -1803,7 +1803,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq) * Otherwise, we do only if they are the last ones * in their service tree. */ - return service_tree->count == 1; + return service_tree->count == 1 && cfq_cfqq_sync(cfqq); } static void cfq_arm_slice_timer(struct cfq_data *cfqd) -- cgit 1.4.1 From ae54abed636d18f7939c965f21ad126001dbe34c Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 5 Feb 2010 13:11:45 +0100 Subject: cfq-iosched: split seeky coop queues after one slice Currently we split seeky coop queues after 1s, which is too big. 
The patch below sets the split_coop flag on a seeky coop queue after one slice. After that, if new requests come in, the queues will be split. Patch suggested by Corrado. Signed-off-by: Shaohua Li Reviewed-by: Corrado Zoccolo Acked-by: Jeff Moyer Signed-off-by: Jens Axboe --- block/cfq-iosched.c | 49 ++++++++++++++++--------------------------------- 1 file changed, 16 insertions(+), 33 deletions(-) (limited to 'block') diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c index 17b768d0d42f..023f4e69a337 100644 --- a/block/cfq-iosched.c +++ b/block/cfq-iosched.c @@ -42,16 +42,13 @@ static const int cfq_hist_divisor = 4; */ #define CFQ_MIN_TT (2) -/* - * Allow merged cfqqs to perform this amount of seeky I/O before - * deciding to break the queues up again. - */ -#define CFQQ_COOP_TOUT (HZ) - #define CFQ_SLICE_SCALE (5) #define CFQ_HW_QUEUE_MIN (5) #define CFQ_SERVICE_SHIFT 12 +#define CFQQ_SEEK_THR 8 * 1024 +#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) + #define RQ_CIC(rq) \ ((struct cfq_io_context *) (rq)->elevator_private) #define RQ_CFQQ(rq) (struct cfq_queue *) ((rq)->elevator_private2) @@ -137,7 +134,6 @@ struct cfq_queue { u64 seek_total; sector_t seek_mean; sector_t last_request_pos; - unsigned long seeky_start; pid_t pid; @@ -314,6 +310,7 @@ enum cfqq_state_flags { CFQ_CFQQ_FLAG_slice_new, /* no requests dispatched in slice */ CFQ_CFQQ_FLAG_sync, /* synchronous queue */ CFQ_CFQQ_FLAG_coop, /* cfqq is shared */ + CFQ_CFQQ_FLAG_split_coop, /* shared cfqq will be splitted */ CFQ_CFQQ_FLAG_deep, /* sync cfqq experienced large depth */ CFQ_CFQQ_FLAG_wait_busy, /* Waiting for next request */ }; @@ -342,6 +339,7 @@ CFQ_CFQQ_FNS(prio_changed); CFQ_CFQQ_FNS(slice_new); CFQ_CFQQ_FNS(sync); CFQ_CFQQ_FNS(coop); +CFQ_CFQQ_FNS(split_coop); CFQ_CFQQ_FNS(deep); CFQ_CFQQ_FNS(wait_busy); #undef CFQ_CFQQ_FNS @@ -1565,6 +1563,15 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq, cfq_clear_cfqq_wait_request(cfqq); cfq_clear_cfqq_wait_busy(cfqq); + 
/* + * If this cfqq is shared between multiple processes, check to + * make sure that those processes are still issuing I/Os within + * the mean seek distance. If not, it may be time to break the + * queues apart again. + */ + if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq)) + cfq_mark_cfqq_split_coop(cfqq); + /* * store what was left of this slice, if the queue idled/timed out */ @@ -1663,9 +1670,6 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd, return cfqd->last_position - blk_rq_pos(rq); } -#define CFQQ_SEEK_THR 8 * 1024 -#define CFQQ_SEEKY(cfqq) ((cfqq)->seek_mean > CFQQ_SEEK_THR) - static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq, struct request *rq, bool for_preempt) { @@ -3000,19 +3004,6 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq, total = cfqq->seek_total + (cfqq->seek_samples/2); do_div(total, cfqq->seek_samples); cfqq->seek_mean = (sector_t)total; - - /* - * If this cfqq is shared between multiple processes, check to - * make sure that those processes are still issuing I/Os within - * the mean seek distance. If not, it may be time to break the - * queues apart again. - */ - if (cfq_cfqq_coop(cfqq)) { - if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start) - cfqq->seeky_start = jiffies; - else if (!CFQQ_SEEKY(cfqq)) - cfqq->seeky_start = 0; - } } /* @@ -3453,14 +3444,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic, return cic_to_cfqq(cic, 1); } -static int should_split_cfqq(struct cfq_queue *cfqq) -{ - if (cfqq->seeky_start && - time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT)) - return 1; - return 0; -} - /* * Returns NULL if a new cfqq should be allocated, or the old cfqq if this * was the last process referring to said cfqq. 
@@ -3469,9 +3452,9 @@ static struct cfq_queue * split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq) { if (cfqq_process_refs(cfqq) == 1) { - cfqq->seeky_start = 0; cfqq->pid = current->pid; cfq_clear_cfqq_coop(cfqq); + cfq_clear_cfqq_split_coop(cfqq); return cfqq; } @@ -3510,7 +3493,7 @@ new_queue: /* * If the queue was seeky for too long, break it apart. */ - if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) { + if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) { cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq"); cfqq = split_cfqq(cic, cfqq); if (!cfqq) -- cgit 1.4.1