From 875feb63b9567442be73efbcc9a8470e376d6423 Mon Sep 17 00:00:00 2001
From: Divyesh Shah
Date: Wed, 6 Jan 2010 18:58:20 -0800
Subject: cfq-iosched: Respect ioprio_class when preempting

In cfq_should_preempt(), we currently allow some cases where a non-RT request
can preempt an ongoing RT cfqq timeslice. This should not happen.
Examples include:

o A sync_noidle wl type non-RT request pre-empting a sync_noidle wl type cfqq
  on which we are idling.
o Once we have per-cgroup async queues, a non-RT sync request pre-empting a
  RT async cfqq.

Signed-off-by: Divyesh Shah
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 6 ++++++
 1 file changed, 6 insertions(+)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 918c7fd9aeb1..ee130f14d1fc 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -3076,6 +3076,12 @@ cfq_should_preempt(struct cfq_data *cfqd, struct cfq_queue *new_cfqq,
         if (cfq_class_idle(cfqq))
                 return true;

+        /*
+         * Don't allow a non-RT request to preempt an ongoing RT cfqq timeslice.
+         */
+        if (cfq_class_rt(cfqq) && !cfq_class_rt(new_cfqq))
+                return false;
+
         /*
          * if the new request is sync, but the currently running queue is
          * not, let the sync request have priority.
--
cgit v1.2.3
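For illustration, a stand-alone sketch of the decision order this patch enforces. The enum, struct and function here are simplified stand-ins for the CFQ types, not the kernel code:

#include <stdbool.h>
#include <stdio.h>

enum ioprio_class { CLASS_RT, CLASS_BE, CLASS_IDLE };

struct queue {
        enum ioprio_class class;
};

static bool should_preempt(const struct queue *active, const struct queue *incoming)
{
        /* An idle-class queue never holds on to its timeslice. */
        if (active->class == CLASS_IDLE)
                return true;

        /* Never let a non-RT request preempt an ongoing RT timeslice. */
        if (active->class == CLASS_RT && incoming->class != CLASS_RT)
                return false;

        /* The real function applies further sync/async checks here. */
        return false;
}

int main(void)
{
        struct queue rt = { CLASS_RT }, be = { CLASS_BE }, idle = { CLASS_IDLE };

        printf("BE request vs. active RT queue:   %d\n", should_preempt(&rt, &be));   /* 0: never */
        printf("BE request vs. active idle queue: %d\n", should_preempt(&idle, &be)); /* 1: always */
        return 0;
}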
From bcf4dd43424cdfd8195f3955300a579fe58e9911 Mon Sep 17 00:00:00 2001
From: Gui Jianfeng
Date: Mon, 1 Feb 2010 09:58:54 +0100
Subject: blk-cgroup: Fix potential deadlock in blk-cgroup

I triggered the following lockdep warning:

=======================================================
[ INFO: possible circular locking dependency detected ]
2.6.33-rc2 #1
-------------------------------------------------------
test_io_control/7357 is trying to acquire lock:
 (blkio_list_lock){+.+...}, at: [] blkiocg_weight_write+0x82/0x9e

but task is already holding lock:
 (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e

which lock already depends on the new lock.

the existing dependency chain (in reverse order) is:

-> #2 (&(&blkcg->lock)->rlock){......}:
       [] validate_chain+0x8bc/0xb9c
       [] __lock_acquire+0x723/0x789
       [] lock_acquire+0x90/0xa7
       [] _raw_spin_lock_irqsave+0x27/0x5a
       [] blkiocg_add_blkio_group+0x1a/0x6d
       [] cfq_get_queue+0x225/0x3de
       [] cfq_set_request+0x217/0x42d
       [] elv_set_request+0x17/0x26
       [] get_request+0x203/0x2c5
       [] get_request_wait+0x18/0x10e
       [] __make_request+0x2ba/0x375
       [] generic_make_request+0x28d/0x30f
       [] submit_bio+0x8a/0x8f
       [] submit_bh+0xf0/0x10f
       [] ll_rw_block+0xc0/0xf9
       [] ext3_find_entry+0x319/0x544 [ext3]
       [] ext3_lookup+0x2c/0xb9 [ext3]
       [] do_lookup+0xd3/0x172
       [] link_path_walk+0x5fb/0x95c
       [] path_walk+0x3c/0x81
       [] do_path_lookup+0x21/0x8a
       [] do_filp_open+0xf0/0x978
       [] open_exec+0x1b/0xb7
       [] do_execve+0xbb/0x266
       [] sys_execve+0x24/0x4a
       [] ptregs_execve+0x12/0x18

-> #1 (&(&q->__queue_lock)->rlock){..-.-.}:
       [] validate_chain+0x8bc/0xb9c
       [] __lock_acquire+0x723/0x789
       [] lock_acquire+0x90/0xa7
       [] _raw_spin_lock_irqsave+0x27/0x5a
       [] cfq_unlink_blkio_group+0x17/0x41
       [] blkiocg_destroy+0x72/0xc7
       [] cgroup_diput+0x4a/0xb2
       [] dentry_iput+0x93/0xb7
       [] d_kill+0x1c/0x36
       [] dput+0xf5/0xfe
       [] do_rmdir+0x95/0xbe
       [] sys_rmdir+0x10/0x12
       [] sysenter_do_call+0x12/0x32

-> #0 (blkio_list_lock){+.+...}:
       [] validate_chain+0x61c/0xb9c
       [] __lock_acquire+0x723/0x789
       [] lock_acquire+0x90/0xa7
       [] _raw_spin_lock+0x1e/0x4e
       [] blkiocg_weight_write+0x82/0x9e
       [] cgroup_file_write+0xc6/0x1c0
       [] vfs_write+0x8c/0x116
       [] sys_write+0x3b/0x60
       [] sysenter_do_call+0x12/0x32

other info that might help us debug this:

1 lock held by test_io_control/7357:
 #0:  (&(&blkcg->lock)->rlock){......}, at: [] blkiocg_weight_write+0x3b/0x9e

stack backtrace:
Pid: 7357, comm: test_io_control Not tainted 2.6.33-rc2 #1
Call Trace:
 [] print_circular_bug+0x91/0x9d
 [] validate_chain+0x61c/0xb9c
 [] __lock_acquire+0x723/0x789
 [] lock_acquire+0x90/0xa7
 [] ? blkiocg_weight_write+0x82/0x9e
 [] _raw_spin_lock+0x1e/0x4e
 [] ? blkiocg_weight_write+0x82/0x9e
 [] blkiocg_weight_write+0x82/0x9e
 [] cgroup_file_write+0xc6/0x1c0
 [] ? trace_hardirqs_off+0xb/0xd
 [] ? cpu_clock+0x2e/0x44
 [] ? security_file_permission+0xf/0x11
 [] ? rw_verify_area+0x8a/0xad
 [] ? cgroup_file_write+0x0/0x1c0
 [] vfs_write+0x8c/0x116
 [] sys_write+0x3b/0x60
 [] sysenter_do_call+0x12/0x32

To prevent the deadlock, we should take the locks in the following order:
blkio_list_lock -> queue_lock -> blkcg_lock. The following patch fixes this
bug.

Signed-off-by: Gui Jianfeng
Signed-off-by: Jens Axboe
---
 block/blk-cgroup.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

(limited to 'block')

diff --git a/block/blk-cgroup.c b/block/blk-cgroup.c
index 1fa2654db0a6..e7dbbaf5fb3e 100644
--- a/block/blk-cgroup.c
+++ b/block/blk-cgroup.c
@@ -147,16 +147,16 @@ blkiocg_weight_write(struct cgroup *cgroup, struct cftype *cftype, u64 val)
                 return -EINVAL;

         blkcg = cgroup_to_blkio_cgroup(cgroup);
+        spin_lock(&blkio_list_lock);
         spin_lock_irq(&blkcg->lock);
         blkcg->weight = (unsigned int)val;
         hlist_for_each_entry(blkg, n, &blkcg->blkg_list, blkcg_node) {
-                spin_lock(&blkio_list_lock);
                 list_for_each_entry(blkiop, &blkio_list, list)
                         blkiop->ops.blkio_update_group_weight_fn(blkg,
                                                         blkcg->weight);
-                spin_unlock(&blkio_list_lock);
         }
         spin_unlock_irq(&blkcg->lock);
+        spin_unlock(&blkio_list_lock);
         return 0;
 }
--
cgit v1.2.3
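The fix amounts to a global lock-ordering rule: every path takes the list lock before the per-group lock, so the circular dependency reported by lockdep cannot form. A stand-alone pthread sketch of that rule follows; the mutex names mirror the kernel locks, but nothing else here is kernel code:

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t blkio_list_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t blkcg_lock = PTHREAD_MUTEX_INITIALIZER;

static unsigned int weight = 500;

/* Mirrors the patched blkiocg_weight_write(): outer list lock, inner group lock. */
static void update_weight(unsigned int val)
{
        pthread_mutex_lock(&blkio_list_lock);
        pthread_mutex_lock(&blkcg_lock);
        weight = val;
        /* ...walk the policy list and push the new weight to each group... */
        pthread_mutex_unlock(&blkcg_lock);
        pthread_mutex_unlock(&blkio_list_lock);
}

/* Any other path must use the same order, never the group lock first. */
static void destroy_group(void)
{
        pthread_mutex_lock(&blkio_list_lock);
        pthread_mutex_lock(&blkcg_lock);
        /* ...unlink the group from the list... */
        pthread_mutex_unlock(&blkcg_lock);
        pthread_mutex_unlock(&blkio_list_lock);
}

int main(void)
{
        update_weight(1000);
        destroy_group();
        printf("weight = %u\n", weight);
        return 0;
}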
From 1efe8fe1c2240acc476bed77740883df63373862 Mon Sep 17 00:00:00 2001
From: Vivek Goyal
Date: Tue, 2 Feb 2010 20:45:46 +0100
Subject: cfq-iosched: Do not idle on async queues

A few weeks back, Shaohua Li posted a similar patch. I am reposting it with
more test results.

This patch does two things:

- Do not idle on async queues.

- It also changes the write queue depth CFQ drives (cfq_may_dispatch()).
  Currently we seem to be driving a queue depth of 1 at all times for WRITES.
  This is true even if there is only one write queue in the system, and all
  the logic for infinite queue depth in the case of a single busy queue, as
  well as for slowly increasing queue depth based on the last delayed sync
  request, does not seem to be kicking in at all. This patch allows deeper
  WRITE queue depths (subject to the other WRITE queue depth constraints
  like cfq_quantum and the last delayed sync request).

Shaohua Li had reported getting more out of his SSD. For me, I have one LUN
exported from an HP EVA, and with pure buffered writes I can get more out of
the system.

Following are the test results of pure buffered writes (with end_fsync=1)
with the vanilla and the patched kernels. These results are the average of
3 sets of runs with an increasing number of threads.

AVERAGE[bufwfs][vanilla]
-------
job      Set NR  ReadBW(KB/s)  MaxClat(us)  WriteBW(KB/s)  MaxClat(us)
---      --- --  ------------  -----------  -------------  -----------
bufwfs   3   1   0             0            95349          474141
bufwfs   3   2   0             0            100282         806926
bufwfs   3   4   0             0            109989         2.7301e+06
bufwfs   3   8   0             0            116642         3762231
bufwfs   3   16  0             0            118230         6902970

AVERAGE[bufwfs] [patched kernel]
-------
bufwfs   3   1   0             0            270722         404352
bufwfs   3   2   0             0            206770         1.06552e+06
bufwfs   3   4   0             0            195277         1.62283e+06
bufwfs   3   8   0             0            260960         2.62979e+06
bufwfs   3   16  0             0            299260         1.70731e+06

I also ran buffered writes along with some sequential reads and some buffered
reads going on in the system on a SATA disk, because the potential risk is
that, to keep the max clat low, we should not be driving the queue depth
higher in the presence of sync IO.

With some random and sequential reads going on in the system on one SATA disk
I did not see any significant increase in max clat. So it looks like the
other WRITE queue depth control logic is doing its job. Here are the results.

AVERAGE[brr, bsr, bufw together] [vanilla]
-------
job      Set NR  ReadBW(KB/s)  MaxClat(us)  WriteBW(KB/s)  MaxClat(us)
---      --- --  ------------  -----------  -------------  -----------
brr      3   1   850           546345       0              0
bsr      3   1   14650         729543       0              0
bufw     3   1   0             0            23908          8274517

brr      3   2   981.333       579395       0              0
bsr      3   2   14149.7       1175689      0              0
bufw     3   2   0             0            21921          1.28108e+07

brr      3   4   898.333       1.75527e+06  0              0
bsr      3   4   12230.7       1.40072e+06  0              0
bufw     3   4   0             0            19722.3        2.4901e+07

brr      3   8   900           3160594      0              0
bsr      3   8   9282.33       1.91314e+06  0              0
bufw     3   8   0             0            18789.3        23890622

AVERAGE[brr, bsr, bufw mixed] [patched kernel]
-------
job      Set NR  ReadBW(KB/s)  MaxClat(us)  WriteBW(KB/s)  MaxClat(us)
---      --- --  ------------  -----------  -------------  -----------
brr      3   1   837           417973       0              0
bsr      3   1   14357.7       591275       0              0
bufw     3   1   0             0            24869.7        8910662

brr      3   2   1038.33       543434       0              0
bsr      3   2   13351.3       1205858      0              0
bufw     3   2   0             0            18626.3        13280370

brr      3   4   913           1.86861e+06  0              0
bsr      3   4   12652.3       1430974      0              0
bufw     3   4   0             0            15343.3        2.81305e+07

brr      3   8   890           2.92695e+06  0              0
bsr      3   8   9635.33       1.90244e+06  0              0
bufw     3   8   0             0            17200.3        24424392

So it looks like it makes sense to include this patch.

Thanks
Vivek

Signed-off-by: Vivek Goyal
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index ee130f14d1fc..17b768d0d42f 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -1803,7 +1803,7 @@ static bool cfq_should_idle(struct cfq_data *cfqd, struct cfq_queue *cfqq)
          * Otherwise, we do only if they are the last ones
          * in their service tree.
          */
-        return service_tree->count == 1;
+        return service_tree->count == 1 && cfq_cfqq_sync(cfqq);
 }

 static void cfq_arm_slice_timer(struct cfq_data *cfqd)
--
cgit v1.2.3
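A stand-alone sketch of the patched check in cfq_should_idle(): idle only when the queue is the last one in its service tree and it is a sync queue, so async (buffered write) queues no longer cause idling. The struct types here are simplified stand-ins, not the kernel's:

#include <stdbool.h>
#include <stdio.h>

struct service_tree { int count; };
struct queue { bool sync; struct service_tree *st; };

/* Idle only for the last queue in the tree, and only if it is sync. */
static bool should_idle(const struct queue *q)
{
        return q->st->count == 1 && q->sync;
}

int main(void)
{
        struct service_tree st = { .count = 1 };
        struct queue sync_q  = { .sync = true,  .st = &st };
        struct queue async_q = { .sync = false, .st = &st };

        printf("idle on last sync queue:  %d\n", should_idle(&sync_q));  /* 1 */
        printf("idle on last async queue: %d\n", should_idle(&async_q)); /* 0 */
        return 0;
}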
From ae54abed636d18f7939c965f21ad126001dbe34c Mon Sep 17 00:00:00 2001
From: Shaohua Li
Date: Fri, 5 Feb 2010 13:11:45 +0100
Subject: cfq-iosched: split seeky coop queues after one slice

Currently we split seeky coop queues after 1s, which is too long. The patch
below instead sets the split_coop flag on a seeky coop queue after one slice.
After that, if new requests come in, the queues will be split.

The patch was suggested by Corrado.

Signed-off-by: Shaohua Li
Reviewed-by: Corrado Zoccolo
Acked-by: Jeff Moyer
Signed-off-by: Jens Axboe
---
 block/cfq-iosched.c | 49 ++++++++++++++++---------------------------------
 1 file changed, 16 insertions(+), 33 deletions(-)

(limited to 'block')

diff --git a/block/cfq-iosched.c b/block/cfq-iosched.c
index 17b768d0d42f..023f4e69a337 100644
--- a/block/cfq-iosched.c
+++ b/block/cfq-iosched.c
@@ -42,16 +42,13 @@ static const int cfq_hist_divisor = 4;
  */
 #define CFQ_MIN_TT              (2)

-/*
- * Allow merged cfqqs to perform this amount of seeky I/O before
- * deciding to break the queues up again.
- */
-#define CFQQ_COOP_TOUT          (HZ)
-
 #define CFQ_SLICE_SCALE         (5)
 #define CFQ_HW_QUEUE_MIN        (5)
 #define CFQ_SERVICE_SHIFT       12

+#define CFQQ_SEEK_THR           8 * 1024
+#define CFQQ_SEEKY(cfqq)        ((cfqq)->seek_mean > CFQQ_SEEK_THR)
+
 #define RQ_CIC(rq)              \
         ((struct cfq_io_context *) (rq)->elevator_private)
 #define RQ_CFQQ(rq)             (struct cfq_queue *) ((rq)->elevator_private2)
@@ -137,7 +134,6 @@ struct cfq_queue {
         u64 seek_total;
         sector_t seek_mean;
         sector_t last_request_pos;
-        unsigned long seeky_start;

         pid_t pid;

@@ -314,6 +310,7 @@ enum cfqq_state_flags {
         CFQ_CFQQ_FLAG_slice_new,        /* no requests dispatched in slice */
         CFQ_CFQQ_FLAG_sync,             /* synchronous queue */
         CFQ_CFQQ_FLAG_coop,             /* cfqq is shared */
+        CFQ_CFQQ_FLAG_split_coop,       /* shared cfqq will be splitted */
         CFQ_CFQQ_FLAG_deep,             /* sync cfqq experienced large depth */
         CFQ_CFQQ_FLAG_wait_busy,        /* Waiting for next request */
 };
@@ -342,6 +339,7 @@ CFQ_CFQQ_FNS(prio_changed);
 CFQ_CFQQ_FNS(slice_new);
 CFQ_CFQQ_FNS(sync);
 CFQ_CFQQ_FNS(coop);
+CFQ_CFQQ_FNS(split_coop);
 CFQ_CFQQ_FNS(deep);
 CFQ_CFQQ_FNS(wait_busy);
 #undef CFQ_CFQQ_FNS
@@ -1565,6 +1563,15 @@ __cfq_slice_expired(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         cfq_clear_cfqq_wait_request(cfqq);
         cfq_clear_cfqq_wait_busy(cfqq);

+        /*
+         * If this cfqq is shared between multiple processes, check to
+         * make sure that those processes are still issuing I/Os within
+         * the mean seek distance. If not, it may be time to break the
+         * queues apart again.
+         */
+        if (cfq_cfqq_coop(cfqq) && CFQQ_SEEKY(cfqq))
+                cfq_mark_cfqq_split_coop(cfqq);
+
         /*
          * store what was left of this slice, if the queue idled/timed out
          */
@@ -1663,9 +1670,6 @@ static inline sector_t cfq_dist_from_last(struct cfq_data *cfqd,
                 return cfqd->last_position - blk_rq_pos(rq);
 }

-#define CFQQ_SEEK_THR           8 * 1024
-#define CFQQ_SEEKY(cfqq)        ((cfqq)->seek_mean > CFQQ_SEEK_THR)
-
 static inline int cfq_rq_close(struct cfq_data *cfqd, struct cfq_queue *cfqq,
                                struct request *rq, bool for_preempt)
 {
@@ -3000,19 +3004,6 @@ cfq_update_io_seektime(struct cfq_data *cfqd, struct cfq_queue *cfqq,
         total = cfqq->seek_total + (cfqq->seek_samples/2);
         do_div(total, cfqq->seek_samples);
         cfqq->seek_mean = (sector_t)total;
-
-        /*
-         * If this cfqq is shared between multiple processes, check to
-         * make sure that those processes are still issuing I/Os within
-         * the mean seek distance. If not, it may be time to break the
-         * queues apart again.
-         */
-        if (cfq_cfqq_coop(cfqq)) {
-                if (CFQQ_SEEKY(cfqq) && !cfqq->seeky_start)
-                        cfqq->seeky_start = jiffies;
-                else if (!CFQQ_SEEKY(cfqq))
-                        cfqq->seeky_start = 0;
-        }
 }

 /*
@@ -3453,14 +3444,6 @@ cfq_merge_cfqqs(struct cfq_data *cfqd, struct cfq_io_context *cic,
         return cic_to_cfqq(cic, 1);
 }

-static int should_split_cfqq(struct cfq_queue *cfqq)
-{
-        if (cfqq->seeky_start &&
-            time_after(jiffies, cfqq->seeky_start + CFQQ_COOP_TOUT))
-                return 1;
-        return 0;
-}
-
 /*
  * Returns NULL if a new cfqq should be allocated, or the old cfqq if this
  * was the last process referring to said cfqq.
@@ -3469,9 +3452,9 @@ static struct cfq_queue *
 split_cfqq(struct cfq_io_context *cic, struct cfq_queue *cfqq)
 {
         if (cfqq_process_refs(cfqq) == 1) {
-                cfqq->seeky_start = 0;
                 cfqq->pid = current->pid;
                 cfq_clear_cfqq_coop(cfqq);
+                cfq_clear_cfqq_split_coop(cfqq);
                 return cfqq;
         }

@@ -3510,7 +3493,7 @@ new_queue:
         /*
          * If the queue was seeky for too long, break it apart.
          */
-        if (cfq_cfqq_coop(cfqq) && should_split_cfqq(cfqq)) {
+        if (cfq_cfqq_coop(cfqq) && cfq_cfqq_split_coop(cfqq)) {
                 cfq_log_cfqq(cfqd, cfqq, "breaking apart cfqq");
                 cfqq = split_cfqq(cic, cfqq);
                 if (!cfqq)
--
cgit v1.2.3
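For the split-after-one-slice behaviour in the last patch above, a stand-alone sketch of the two-step lifecycle it implements: mark a seeky shared queue on slice expiry, then split it when the next request arrives. The struct and helpers here are simplified stand-ins for struct cfq_queue, not kernel code:

#include <stdbool.h>
#include <stdio.h>

struct queue {
        bool coop;        /* queue is shared between processes */
        bool split_coop;  /* marked for splitting at the next request */
        long seek_mean;   /* mean seek distance, in bytes */
};

#define SEEK_THR (8 * 1024)
#define SEEKY(q) ((q)->seek_mean > SEEK_THR)

/* On slice expiry: if a shared queue has turned seeky, mark it. */
static void slice_expired(struct queue *q)
{
        if (q->coop && SEEKY(q))
                q->split_coop = true;
}

/* When the next request arrives: a marked shared queue gets split. */
static bool should_split(const struct queue *q)
{
        return q->coop && q->split_coop;
}

int main(void)
{
        struct queue q = { .coop = true, .split_coop = false, .seek_mean = 64 * 1024 };

        slice_expired(&q); /* the queue went seeky during its slice */
        printf("split after one slice: %d\n", should_split(&q)); /* 1 */
        return 0;
}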