From 227f052f4711caf432b9a7dbcfe1a2857d3c0def Mon Sep 17 00:00:00 2001
From: Lars Ellenberg <lars.ellenberg@linbit.com>
Date: Tue, 31 Jul 2012 09:31:11 +0200
Subject: drbd: fix drbd wire compatibility for empty flushes

DRBD has a concept of request epochs or reorder-domains,
which are separated on the wire by P_BARRIER packets.

Older DRBD is not able to handle zero-sized requests at all,
so we need to map empty flushes to these drbd barriers.

These are the equivalent of empty flushes, and
by default trigger flushes on the receiving side anyways
(unless not supported or explicitly disabled),
so there is no need to handle this differently in newer drbd either.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_req.c | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 910335c30927..0bb1e41f136f 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -834,7 +834,15 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 		req->private_bio = NULL;
 	}
 	if (rw == WRITE) {
-		remote = 1;
+		/* Need to replicate writes.  Unless it is an empty flush,
+		 * which is better mapped to a DRBD P_BARRIER packet,
+		 * also for drbd wire protocol compatibility reasons. */
+		if (unlikely(size == 0)) {
+			/* The only size==0 bios we expect are empty flushes. */
+			D_ASSERT(bio->bi_rw & REQ_FLUSH);
+			remote = 0;
+		} else
+			remote = 1;
 	} else {
 		/* READ || READA */
 		if (local) {
@@ -870,8 +878,11 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
 	 * extent.  This waits for any resync activity in the corresponding
 	 * resync extent to finish, and, if necessary, pulls in the target
 	 * extent into the activity log, which involves further disk io because
-	 * of transactional on-disk meta data updates. */
-	if (rw == WRITE && local && !test_bit(AL_SUSPENDED, &mdev->flags)) {
+	 * of transactional on-disk meta data updates.
+	 * Empty flushes don't need to go into the activity log, they can only
+	 * flush data for pending writes which are already in there. */
+	if (rw == WRITE && local && size
+	&& !test_bit(AL_SUSPENDED, &mdev->flags)) {
 		req->rq_state |= RQ_IN_ACT_LOG;
 		drbd_al_begin_io(mdev, sector);
 	}
@@ -994,7 +1005,10 @@ allocate_barrier:
 	if (rw == WRITE && _req_conflicts(req))
 		goto fail_conflicting;
 
-	list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
+	/* no point in adding empty flushes to the transfer log,
+	 * they are mapped to drbd barriers already. */
+	if (likely(size!=0))
+		list_add_tail(&req->tl_requests, &mdev->newest_tle->requests);
 
 	/* NOTE remote first: to get the concurrent write detection right,
 	 * we must register the request before start of local IO.  */
@@ -1014,6 +1028,14 @@ allocate_barrier:
 	    mdev->net_conf->on_congestion != OC_BLOCK && mdev->agreed_pro_version >= 96)
 		maybe_pull_ahead(mdev);
 
+	/* If this was a flush, queue a drbd barrier/start a new epoch.
+	 * Unless the current epoch was empty anyways, or we are not currently
+	 * replicating, in which case there is no point. */
+	if (unlikely(bio->bi_rw & REQ_FLUSH)
+		&& mdev->newest_tle->n_writes
+		&& drbd_should_do_remote(mdev->state))
+		queue_barrier(mdev);
+
 	spin_unlock_irq(&mdev->req_lock);
 	kfree(b); /* if someone else has beaten us to it... */
 
-- 
cgit 1.4.1


From 509fc019e534bdf5f3969d78c53184db4cf7ff48 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Tue, 31 Jul 2012 11:22:58 +0200
Subject: drbd: Finish requests that completed while IO was frozen

Requests of an acked epoch are stored on the barrier_acked_requests list. In
case the private bio of such a request completes while IO on the drbd device
is suspended [req_mod(completed_ok)] then the request stays there.

When thawing IO because the fence_peer handler returned, then we use
tl_clear() to apply the connection_lost_while_pending event to all requests
on the transfer-log and the barrier_acked_requests list.

Up to now the connection_lost_while_pending event was not applied
on requests on the barrier_acked_requests list. Fixed that.

I.e. now the connection_lost_while_pending and resend events are
applied to requests on the barrier_acked_requests list. For that
it is necessary that the resend event finishes (local only)
READS correctly.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_main.c | 28 ++++++++++++----------------
 drivers/block/drbd/drbd_req.c  |  6 ++++++
 2 files changed, 18 insertions(+), 16 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index 2e0e7fc1dbba..136db95b212d 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -79,6 +79,7 @@ static int w_md_sync(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static void md_sync_timer_fn(unsigned long data);
 static int w_bitmap_io(struct drbd_conf *mdev, struct drbd_work *w, int unused);
 static int w_go_diskless(struct drbd_conf *mdev, struct drbd_work *w, int unused);
+static void _tl_clear(struct drbd_conf *mdev);
 
 MODULE_AUTHOR("Philipp Reisner <phil@linbit.com>, "
 	      "Lars Ellenberg <lars@linbit.com>");
@@ -432,19 +433,10 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
 
 	/* Actions operating on the disk state, also want to work on
 	   requests that got barrier acked. */
-	switch (what) {
-	case fail_frozen_disk_io:
-	case restart_frozen_disk_io:
-		list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
-			req = list_entry(le, struct drbd_request, tl_requests);
-			_req_mod(req, what);
-		}
 
-	case connection_lost_while_pending:
-	case resend:
-		break;
-	default:
-		dev_err(DEV, "what = %d in _tl_restart()\n", what);
+	list_for_each_safe(le, tle, &mdev->barrier_acked_requests) {
+		req = list_entry(le, struct drbd_request, tl_requests);
+		_req_mod(req, what);
 	}
 }
 
@@ -458,12 +450,17 @@ static void _tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
  * receiver thread and the worker thread.
  */
 void tl_clear(struct drbd_conf *mdev)
+{
+	spin_lock_irq(&mdev->req_lock);
+	_tl_clear(mdev);
+	spin_unlock_irq(&mdev->req_lock);
+}
+
+static void _tl_clear(struct drbd_conf *mdev)
 {
 	struct list_head *le, *tle;
 	struct drbd_request *r;
 
-	spin_lock_irq(&mdev->req_lock);
-
 	_tl_restart(mdev, connection_lost_while_pending);
 
 	/* we expect this list to be empty. */
@@ -482,7 +479,6 @@ void tl_clear(struct drbd_conf *mdev)
 
 	memset(mdev->app_reads_hash, 0, APP_R_HSIZE*sizeof(void *));
 
-	spin_unlock_irq(&mdev->req_lock);
 }
 
 void tl_restart(struct drbd_conf *mdev, enum drbd_req_event what)
@@ -1476,12 +1472,12 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
 	if (ns.susp_fen) {
 		/* case1: The outdate peer handler is successful: */
 		if (os.pdsk > D_OUTDATED  && ns.pdsk <= D_OUTDATED) {
-			tl_clear(mdev);
 			if (test_bit(NEW_CUR_UUID, &mdev->flags)) {
 				drbd_uuid_new_current(mdev);
 				clear_bit(NEW_CUR_UUID, &mdev->flags);
 			}
 			spin_lock_irq(&mdev->req_lock);
+			_tl_clear(mdev);
 			_drbd_set_state(_NS(mdev, susp_fen, 0), CS_VERBOSE, NULL);
 			spin_unlock_irq(&mdev->req_lock);
 		}
diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c
index 0bb1e41f136f..01b2ac641c7b 100644
--- a/drivers/block/drbd/drbd_req.c
+++ b/drivers/block/drbd/drbd_req.c
@@ -695,6 +695,12 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
 		break;
 
 	case resend:
+		/* Simply complete (local only) READs. */
+		if (!(req->rq_state & RQ_WRITE) && !req->w.cb) {
+			_req_may_be_done(req, m);
+			break;
+		}
+
 		/* If RQ_NET_OK is already set, we got a P_WRITE_ACK or P_RECV_ACK
 		   before the connection loss (B&C only); only P_BARRIER_ACK was missing.
 		   Trowing them out of the TL here by pretending we got a BARRIER_ACK
-- 
cgit 1.4.1


From d1aa4d04da8de5c89d73859e077d89c4c71d8ed1 Mon Sep 17 00:00:00 2001
From: Philipp Reisner <philipp.reisner@linbit.com>
Date: Wed, 8 Aug 2012 21:19:09 +0200
Subject: drbd: Write all pages of the bitmap after an online resize

We need to write the whole bitmap after we moved the meta data
due to an online resize operation.

With the support for one peta byte devices bitmap IO was optimized
to only write out touched pages. This optimization must be turned
off when writing the bitmap after an online resize.

This issue was introduced with drbd-8.3.10.

The impact of this bug is that after an online resize, the next
resync could become larger than expected.

Signed-off-by: Philipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: Lars Ellenberg <lars.ellenberg@linbit.com>
---
 drivers/block/drbd/drbd_bitmap.c | 15 ++++++++++++++-
 drivers/block/drbd/drbd_int.h    |  1 +
 drivers/block/drbd/drbd_nl.c     |  4 ++--
 3 files changed, 17 insertions(+), 3 deletions(-)

(limited to 'drivers')

diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c
index ba91b408abad..d84566496746 100644
--- a/drivers/block/drbd/drbd_bitmap.c
+++ b/drivers/block/drbd/drbd_bitmap.c
@@ -889,6 +889,7 @@ struct bm_aio_ctx {
 	unsigned int done;
 	unsigned flags;
 #define BM_AIO_COPY_PAGES	1
+#define BM_WRITE_ALL_PAGES	2
 	int error;
 	struct kref kref;
 };
@@ -1059,7 +1060,8 @@ static int bm_rw(struct drbd_conf *mdev, int rw, unsigned flags, unsigned lazy_w
 		if (lazy_writeout_upper_idx && i == lazy_writeout_upper_idx)
 			break;
 		if (rw & WRITE) {
-			if (bm_test_page_unchanged(b->bm_pages[i])) {
+			if (!(flags & BM_WRITE_ALL_PAGES) &&
+			    bm_test_page_unchanged(b->bm_pages[i])) {
 				dynamic_dev_dbg(DEV, "skipped bm write for idx %u\n", i);
 				continue;
 			}
@@ -1140,6 +1142,17 @@ int drbd_bm_write(struct drbd_conf *mdev) __must_hold(local)
 	return bm_rw(mdev, WRITE, 0, 0);
 }
 
+/**
+ * drbd_bm_write_all() - Write the whole bitmap to its on disk location.
+ * @mdev:	DRBD device.
+ *
+ * Will write all pages.
+ */
+int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local)
+{
+	return bm_rw(mdev, WRITE, BM_WRITE_ALL_PAGES, 0);
+}
+
 /**
  * drbd_bm_lazy_write_out() - Write bitmap pages 0 to @upper_idx-1, if they have changed.
  * @mdev:	DRBD device.
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index b2ca143d0053..b953cc7c9c00 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -1469,6 +1469,7 @@ extern int  drbd_bm_e_weight(struct drbd_conf *mdev, unsigned long enr);
 extern int  drbd_bm_write_page(struct drbd_conf *mdev, unsigned int idx) __must_hold(local);
 extern int  drbd_bm_read(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write(struct drbd_conf *mdev) __must_hold(local);
+extern int drbd_bm_write_all(struct drbd_conf *mdev) __must_hold(local);
 extern int  drbd_bm_write_copy_pages(struct drbd_conf *mdev) __must_hold(local);
 extern unsigned long drbd_bm_ALe_set_all(struct drbd_conf *mdev,
 		unsigned long al_enr);
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index fb9dce8daa24..edb490aad8b4 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -674,8 +674,8 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 			 la_size_changed && md_moved ? "size changed and md moved" :
 			 la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
-		err = drbd_bitmap_io(mdev, &drbd_bm_write,
-				"size changed", BM_LOCKED_MASK);
+		err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+				     "size changed", BM_LOCKED_MASK);
 		if (err) {
 			rv = dev_size_error;
 			goto out;
-- 
cgit 1.4.1