Diffstat (limited to 'drivers/block')
-rw-r--r--  drivers/block/Kconfig                 |   4
-rw-r--r--  drivers/block/drbd/drbd_actlog.c      |  21
-rw-r--r--  drivers/block/drbd/drbd_int.h         |  15
-rw-r--r--  drivers/block/drbd/drbd_main.c        |  61
-rw-r--r--  drivers/block/drbd/drbd_nl.c          | 185
-rw-r--r--  drivers/block/drbd/drbd_receiver.c    |  12
-rw-r--r--  drivers/block/drbd/drbd_state.c       |   4
-rw-r--r--  drivers/block/rsxx/core.c             | 359
-rw-r--r--  drivers/block/rsxx/cregs.c            |  14
-rw-r--r--  drivers/block/rsxx/dev.c              |  33
-rw-r--r--  drivers/block/rsxx/dma.c              | 185
-rw-r--r--  drivers/block/rsxx/rsxx_priv.h        |  10
-rw-r--r--  drivers/block/xen-blkback/blkback.c   | 872
-rw-r--r--  drivers/block/xen-blkback/common.h    | 147
-rw-r--r--  drivers/block/xen-blkback/xenbus.c    |  85
-rw-r--r--  drivers/block/xen-blkfront.c          | 532
16 files changed, 1915 insertions, 624 deletions
diff --git a/drivers/block/Kconfig b/drivers/block/Kconfig
index b81ddfea1da0..e07a5fd58ad7 100644
--- a/drivers/block/Kconfig
+++ b/drivers/block/Kconfig
@@ -532,11 +532,11 @@ config BLK_DEV_RBD
 	  If unsure, say N.
 
 config BLK_DEV_RSXX
-	tristate "IBM FlashSystem 70/80 PCIe SSD Device Driver"
+	tristate "IBM Flash Adapter 900GB Full Height PCIe Device Driver"
 	depends on PCI
 	help
 	  Device driver for IBM's high speed PCIe SSD
-	  storage devices: FlashSystem-70 and FlashSystem-80.
+	  storage device: Flash Adapter 900GB Full Height.
 
 	  To compile this driver as a module, choose M here: the
 	  module will be called rsxx.
diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c
index 6608076dc39e..28c73ca320a8 100644
--- a/drivers/block/drbd/drbd_actlog.c
+++ b/drivers/block/drbd/drbd_actlog.c
@@ -659,6 +659,27 @@ void drbd_al_shrink(struct drbd_conf *mdev)
 	wake_up(&mdev->al_wait);
 }
 
+int drbd_initialize_al(struct drbd_conf *mdev, void *buffer)
+{
+	struct al_transaction_on_disk *al = buffer;
+	struct drbd_md *md = &mdev->ldev->md;
+	sector_t al_base = md->md_offset + md->al_offset;
+	int al_size_4k = md->al_stripes * md->al_stripe_size_4k;
+	int i;
+
+	memset(al, 0, 4096);
+	al->magic = cpu_to_be32(DRBD_AL_MAGIC);
+	al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED);
+	al->crc32c = cpu_to_be32(crc32c(0, al, 4096));
+
+	for (i = 0; i < al_size_4k; i++) {
+		int err = drbd_md_sync_page_io(mdev, mdev->ldev, al_base + i * 8, WRITE);
+		if (err)
+			return err;
+	}
+	return 0;
+}
+
 static int w_update_odbm(struct drbd_work *w, int unused)
 {
 	struct update_odbm_work *udw = container_of(w, struct update_odbm_work, w);
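For illustration, the stride of 8 sectors per loop iteration in drbd_initialize_al() above follows from the 4096-byte on-disk transaction block divided into 512-byte sectors. A minimal sketch of that arithmetic; the helper name is hypothetical and not part of this patch:

/* Hypothetical helper, not part of this patch: on-disk sector of
 * activity-log transaction block 'i'. Each block is 4096 bytes, i.e.
 * 8 sectors of 512 bytes, hence the "i * 8" stride in the loop above. */
static inline sector_t al_tr_sector(struct drbd_md *md, int i)
{
	return md->md_offset + md->al_offset + i * (4096 >> 9);
}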
diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h
index f943aacfdad8..2d7f608d181c 100644
--- a/drivers/block/drbd/drbd_int.h
+++ b/drivers/block/drbd/drbd_int.h
@@ -832,6 +832,7 @@ struct drbd_tconn {			/* is a resource from the config file */
 	unsigned susp_nod:1;		/* IO suspended because no data */
 	unsigned susp_fen:1;		/* IO suspended because fence peer handler runs */
 	struct mutex cstate_mutex;	/* Protects graceful disconnects */
+	unsigned int connect_cnt;	/* Inc each time a connection is established */
 
 	unsigned long flags;
 	struct net_conf *net_conf;	/* content protected by rcu */
@@ -1132,6 +1133,7 @@ extern void drbd_mdev_cleanup(struct drbd_conf *mdev);
 void drbd_print_uuids(struct drbd_conf *mdev, const char *text);
 
 extern void conn_md_sync(struct drbd_tconn *tconn);
+extern void drbd_md_write(struct drbd_conf *mdev, void *buffer);
 extern void drbd_md_sync(struct drbd_conf *mdev);
 extern int  drbd_md_read(struct drbd_conf *mdev, struct drbd_backing_dev *bdev);
 extern void drbd_uuid_set(struct drbd_conf *mdev, int idx, u64 val) __must_hold(local);
@@ -1466,8 +1468,16 @@ extern void drbd_suspend_io(struct drbd_conf *mdev);
 extern void drbd_resume_io(struct drbd_conf *mdev);
 extern char *ppsize(char *buf, unsigned long long size);
 extern sector_t drbd_new_dev_size(struct drbd_conf *, struct drbd_backing_dev *, sector_t, int);
-enum determine_dev_size { dev_size_error = -1, unchanged = 0, shrunk = 1, grew = 2 };
-extern enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *, enum dds_flags) __must_hold(local);
+enum determine_dev_size {
+	DS_ERROR_SHRINK = -3,
+	DS_ERROR_SPACE_MD = -2,
+	DS_ERROR = -1,
+	DS_UNCHANGED = 0,
+	DS_SHRUNK = 1,
+	DS_GREW = 2
+};
+extern enum determine_dev_size
+drbd_determine_dev_size(struct drbd_conf *, enum dds_flags, struct resize_parms *) __must_hold(local);
 extern void resync_after_online_grow(struct drbd_conf *);
 extern void drbd_reconsider_max_bio_size(struct drbd_conf *mdev);
 extern enum drbd_state_rv drbd_set_role(struct drbd_conf *mdev,
@@ -1633,6 +1643,7 @@ extern int __drbd_set_out_of_sync(struct drbd_conf *mdev, sector_t sector,
 #define drbd_set_out_of_sync(mdev, sector, size) \
 	__drbd_set_out_of_sync(mdev, sector, size, __FILE__, __LINE__)
 extern void drbd_al_shrink(struct drbd_conf *mdev);
+extern int drbd_initialize_al(struct drbd_conf *, void *);
 
 /* drbd_nl.c */
 /* state info broadcast */
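The new determine_dev_size codes keep all failures at or below DS_ERROR, so a caller can use one comparison for the generic error path and still map the specific codes to distinct return values. A condensed sketch of the call patterns used later in this patch (surrounding variables and the fail label are assumed from the caller's context):

dd = drbd_determine_dev_size(mdev, ddsf, NULL);
if (dd <= DS_ERROR) {		/* catches DS_ERROR, DS_ERROR_SPACE_MD and DS_ERROR_SHRINK */
	if (dd == DS_ERROR_SPACE_MD)
		retcode = ERR_MD_LAYOUT_NO_FIT;
	else if (dd == DS_ERROR_SHRINK)
		retcode = ERR_IMPLICIT_SHRINK;
	else
		retcode = ERR_NOMEM_BITMAP;
	goto fail;		/* caller's error label */
} else if (dd == DS_GREW)
	set_bit(RESYNC_AFTER_NEG, &mdev->flags);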
diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c
index a5dca6affcbb..55635edf563b 100644
--- a/drivers/block/drbd/drbd_main.c
+++ b/drivers/block/drbd/drbd_main.c
@@ -2762,8 +2762,6 @@ int __init drbd_init(void)
 	/*
 	 * allocate all necessary structs
 	 */
-	err = -ENOMEM;
-
 	init_waitqueue_head(&drbd_pp_wait);
 
 	drbd_proc = NULL; /* play safe for drbd_cleanup */
@@ -2773,6 +2771,7 @@ int __init drbd_init(void)
 	if (err)
 		goto fail;
 
+	err = -ENOMEM;
 	drbd_proc = proc_create_data("drbd", S_IFREG | S_IRUGO , NULL, &drbd_proc_fops, NULL);
 	if (!drbd_proc)	{
 		printk(KERN_ERR "drbd: unable to register proc file\n");
@@ -2803,7 +2802,6 @@ int __init drbd_init(void)
 fail:
 	drbd_cleanup();
 	if (err == -ENOMEM)
-		/* currently always the case */
 		printk(KERN_ERR "drbd: ran out of memory\n");
 	else
 		printk(KERN_ERR "drbd: initialization failure\n");
@@ -2881,34 +2879,14 @@ struct meta_data_on_disk {
 	u8 reserved_u8[4096 - (7*8 + 10*4)];
 } __packed;
 
-/**
- * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
- * @mdev:	DRBD device.
- */
-void drbd_md_sync(struct drbd_conf *mdev)
+
+
+void drbd_md_write(struct drbd_conf *mdev, void *b)
 {
-	struct meta_data_on_disk *buffer;
+	struct meta_data_on_disk *buffer = b;
 	sector_t sector;
 	int i;
 
-	/* Don't accidentally change the DRBD meta data layout. */
-	BUILD_BUG_ON(UI_SIZE != 4);
-	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
-
-	del_timer(&mdev->md_sync_timer);
-	/* timer may be rearmed by drbd_md_mark_dirty() now. */
-	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
-		return;
-
-	/* We use here D_FAILED and not D_ATTACHING because we try to write
-	 * metadata even if we detach due to a disk failure! */
-	if (!get_ldev_if_state(mdev, D_FAILED))
-		return;
-
-	buffer = drbd_md_get_buffer(mdev);
-	if (!buffer)
-		goto out;
-
 	memset(buffer, 0, sizeof(*buffer));
 
 	buffer->la_size_sect = cpu_to_be64(drbd_get_capacity(mdev->this_bdev));
@@ -2937,6 +2915,35 @@ void drbd_md_sync(struct drbd_conf *mdev)
 		dev_err(DEV, "meta data update failed!\n");
 		drbd_chk_io_error(mdev, 1, DRBD_META_IO_ERROR);
 	}
+}
+
+/**
+ * drbd_md_sync() - Writes the meta data super block if the MD_DIRTY flag bit is set
+ * @mdev:	DRBD device.
+ */
+void drbd_md_sync(struct drbd_conf *mdev)
+{
+	struct meta_data_on_disk *buffer;
+
+	/* Don't accidentally change the DRBD meta data layout. */
+	BUILD_BUG_ON(UI_SIZE != 4);
+	BUILD_BUG_ON(sizeof(struct meta_data_on_disk) != 4096);
+
+	del_timer(&mdev->md_sync_timer);
+	/* timer may be rearmed by drbd_md_mark_dirty() now. */
+	if (!test_and_clear_bit(MD_DIRTY, &mdev->flags))
+		return;
+
+	/* We use here D_FAILED and not D_ATTACHING because we try to write
+	 * metadata even if we detach due to a disk failure! */
+	if (!get_ldev_if_state(mdev, D_FAILED))
+		return;
+
+	buffer = drbd_md_get_buffer(mdev);
+	if (!buffer)
+		goto out;
+
+	drbd_md_write(mdev, buffer);
 
 	/* Update mdev->ldev->md.la_size_sect,
 	 * since we updated it on metadata. */
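Splitting the raw superblock write out of drbd_md_sync() lets a caller that already holds the meta-data IO buffer write the superblock directly, without the MD_DIRTY/timer bookkeeping. A minimal usage sketch, mirroring how drbd_determine_dev_size() uses it later in this patch:

void *buffer;

buffer = drbd_md_get_buffer(mdev);	/* serializes meta-data IO */
if (buffer) {
	drbd_md_write(mdev, buffer);	/* raw superblock write, no MD_DIRTY handling */
	drbd_md_put_buffer(mdev);
}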
diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c
index 9e3f441e7e84..8cc1e640f485 100644
--- a/drivers/block/drbd/drbd_nl.c
+++ b/drivers/block/drbd/drbd_nl.c
@@ -417,6 +417,7 @@ static enum drbd_fencing_p highest_fencing_policy(struct drbd_tconn *tconn)
 
 bool conn_try_outdate_peer(struct drbd_tconn *tconn)
 {
+	unsigned int connect_cnt;
 	union drbd_state mask = { };
 	union drbd_state val = { };
 	enum drbd_fencing_p fp;
@@ -428,6 +429,10 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
 		return false;
 	}
 
+	spin_lock_irq(&tconn->req_lock);
+	connect_cnt = tconn->connect_cnt;
+	spin_unlock_irq(&tconn->req_lock);
+
 	fp = highest_fencing_policy(tconn);
 	switch (fp) {
 	case FP_NOT_AVAIL:
@@ -492,8 +497,14 @@ bool conn_try_outdate_peer(struct drbd_tconn *tconn)
 	   here, because we might have been able to re-establish the connection in the
 	   meantime. */
 	spin_lock_irq(&tconn->req_lock);
-	if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags))
-		_conn_request_state(tconn, mask, val, CS_VERBOSE);
+	if (tconn->cstate < C_WF_REPORT_PARAMS && !test_bit(STATE_SENT, &tconn->flags)) {
+		if (tconn->connect_cnt != connect_cnt)
+			/* In case the connection was established and droped
+			   while the fence-peer handler was running, ignore it */
+			conn_info(tconn, "Ignoring fence-peer exit code\n");
+		else
+			_conn_request_state(tconn, mask, val, CS_VERBOSE);
+	}
 	spin_unlock_irq(&tconn->req_lock);
 
 	return conn_highest_pdsk(tconn) <= D_OUTDATED;
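The connect_cnt check above is an instance of a generic generation-counter pattern: snapshot a counter under the lock before a long-running operation, and only act on the result if the counter is unchanged afterwards. A stripped-down sketch with hypothetical names (not DRBD API):

unsigned int gen;

spin_lock_irq(&lock);
gen = connect_generation;	/* snapshot before the handler runs */
spin_unlock_irq(&lock);

run_fence_peer_handler();	/* may sleep for a long time */

spin_lock_irq(&lock);
if (gen == connect_generation)	/* no connect/disconnect cycle in between */
	apply_fencing_result();
spin_unlock_irq(&lock);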
@@ -816,15 +827,20 @@ void drbd_resume_io(struct drbd_conf *mdev)
  * Returns 0 on success, negative return values indicate errors.
  * You should call drbd_md_sync() after calling this function.
  */
-enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags) __must_hold(local)
+enum determine_dev_size
+drbd_determine_dev_size(struct drbd_conf *mdev, enum dds_flags flags, struct resize_parms *rs) __must_hold(local)
 {
 	sector_t prev_first_sect, prev_size; /* previous meta location */
 	sector_t la_size_sect, u_size;
+	struct drbd_md *md = &mdev->ldev->md;
+	u32 prev_al_stripe_size_4k;
+	u32 prev_al_stripes;
 	sector_t size;
 	char ppb[10];
+	void *buffer;
 
 	int md_moved, la_size_changed;
-	enum determine_dev_size rv = unchanged;
+	enum determine_dev_size rv = DS_UNCHANGED;
 
 	/* race:
 	 * application request passes inc_ap_bio,
@@ -836,6 +852,11 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 	 * still lock the act_log to not trigger ASSERTs there.
 	 */
 	drbd_suspend_io(mdev);
+	buffer = drbd_md_get_buffer(mdev); /* Lock meta-data IO */
+	if (!buffer) {
+		drbd_resume_io(mdev);
+		return DS_ERROR;
+	}
 
 	/* no wait necessary anymore, actually we could assert that */
 	wait_event(mdev->al_wait, lc_try_lock(mdev->act_log));
@@ -844,7 +865,17 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 	prev_size = mdev->ldev->md.md_size_sect;
 	la_size_sect = mdev->ldev->md.la_size_sect;
 
-	/* TODO: should only be some assert here, not (re)init... */
+	if (rs) {
+		/* rs is non NULL if we should change the AL layout only */
+
+		prev_al_stripes = md->al_stripes;
+		prev_al_stripe_size_4k = md->al_stripe_size_4k;
+
+		md->al_stripes = rs->al_stripes;
+		md->al_stripe_size_4k = rs->al_stripe_size / 4;
+		md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4;
+	}
+
 	drbd_md_set_sector_offsets(mdev, mdev->ldev);
 
 	rcu_read_lock();
@@ -852,6 +883,21 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 	rcu_read_unlock();
 	size = drbd_new_dev_size(mdev, mdev->ldev, u_size, flags & DDSF_FORCED);
 
+	if (size < la_size_sect) {
+		if (rs && u_size == 0) {
+			/* Remove "rs &&" later. This check should always be active, but
+			   right now the receiver expects the permissive behavior */
+			dev_warn(DEV, "Implicit shrink not allowed. "
+				 "Use --size=%llus for explicit shrink.\n",
+				 (unsigned long long)size);
+			rv = DS_ERROR_SHRINK;
+		}
+		if (u_size > size)
+			rv = DS_ERROR_SPACE_MD;
+		if (rv != DS_UNCHANGED)
+			goto err_out;
+	}
+
 	if (drbd_get_capacity(mdev->this_bdev) != size ||
 	    drbd_bm_capacity(mdev) != size) {
 		int err;
@@ -867,7 +913,7 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 				    "Leaving size unchanged at size = %lu KB\n",
 				    (unsigned long)size);
 			}
-			rv = dev_size_error;
+			rv = DS_ERROR;
 		}
 		/* racy, see comments above. */
 		drbd_set_my_capacity(mdev, size);
@@ -875,38 +921,57 @@ enum determine_dev_size drbd_determine_dev_size(struct drbd_conf *mdev, enum dds
 		dev_info(DEV, "size = %s (%llu KB)\n", ppsize(ppb, size>>1),
 		     (unsigned long long)size>>1);
 	}
-	if (rv == dev_size_error)
-		goto out;
+	if (rv <= DS_ERROR)
+		goto err_out;
 
 	la_size_changed = (la_size_sect != mdev->ldev->md.la_size_sect);
 
 	md_moved = prev_first_sect != drbd_md_first_sector(mdev->ldev)
 		|| prev_size	   != mdev->ldev->md.md_size_sect;
 
-	if (la_size_changed || md_moved) {
-		int err;
+	if (la_size_changed || md_moved || rs) {
+		u32 prev_flags;
 
 		drbd_al_shrink(mdev); /* All extents inactive. */
+
+		prev_flags = md->flags;
+		md->flags &= ~MDF_PRIMARY_IND;
+		drbd_md_write(mdev, buffer);
+
 		dev_info(DEV, "Writing the whole bitmap, %s\n",
 			 la_size_changed && md_moved ? "size changed and md moved" :
 			 la_size_changed ? "size changed" : "md moved");
 		/* next line implicitly does drbd_suspend_io()+drbd_resume_io() */
-		err = drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
-				     "size changed", BM_LOCKED_MASK);
-		if (err) {
-			rv = dev_size_error;
-			goto out;
-		}
-		drbd_md_mark_dirty(mdev);
+		drbd_bitmap_io(mdev, md_moved ? &drbd_bm_write_all : &drbd_bm_write,
+			       "size changed", BM_LOCKED_MASK);
+		drbd_initialize_al(mdev, buffer);
+
+		md->flags = prev_flags;
+		drbd_md_write(mdev, buffer);
+
+		if (rs)
+			dev_info(DEV, "Changed AL layout to al-stripes = %d, al-stripe-size-kB = %d\n",
+				 md->al_stripes, md->al_stripe_size_4k * 4);
 	}
 
 	if (size > la_size_sect)
-		rv = grew;
+		rv = DS_GREW;
 	if (size < la_size_sect)
-		rv = shrunk;
-out:
+		rv = DS_SHRUNK;
+
+	if (0) {
+	err_out:
+		if (rs) {
+			md->al_stripes = prev_al_stripes;
+			md->al_stripe_size_4k = prev_al_stripe_size_4k;
+			md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k;
+
+			drbd_md_set_sector_offsets(mdev, mdev->ldev);
+		}
+	}
 	lc_unlock(mdev->act_log);
 	wake_up(&mdev->al_wait);
+	drbd_md_put_buffer(mdev);
 	drbd_resume_io(mdev);
 
 	return rv;
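The "if (0) { err_out: ... }" construct above is a common kernel idiom: the block is skipped on the success path but reachable via goto, so rollback code can share the unlock/cleanup tail that follows it. A generic sketch with hypothetical helper names:

int example(void)
{
	int rv = 0;

	apply_tentative_change();	/* hypothetical */
	if (validate())			/* hypothetical, nonzero on failure */
		goto err_out;

	commit_change();		/* hypothetical */

	if (0) {
err_out:
		rv = -EINVAL;
		revert_tentative_change();	/* only runs when jumped to */
	}
	unlock_and_cleanup();		/* shared tail, runs on both paths */
	return rv;
}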
@@ -1607,11 +1672,11 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info)
 	    !drbd_md_test_flag(mdev->ldev, MDF_CONNECTED_IND))
 		set_bit(USE_DEGR_WFC_T, &mdev->flags);
 
-	dd = drbd_determine_dev_size(mdev, 0);
-	if (dd == dev_size_error) {
+	dd = drbd_determine_dev_size(mdev, 0, NULL);
+	if (dd <= DS_ERROR) {
 		retcode = ERR_NOMEM_BITMAP;
 		goto force_diskless_dec;
-	} else if (dd == grew)
+	} else if (dd == DS_GREW)
 		set_bit(RESYNC_AFTER_NEG, &mdev->flags);
 
 	if (drbd_md_test_flag(mdev->ldev, MDF_FULL_SYNC) ||
@@ -2305,6 +2370,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 	struct drbd_conf *mdev;
 	enum drbd_ret_code retcode;
 	enum determine_dev_size dd;
+	bool change_al_layout = false;
 	enum dds_flags ddsf;
 	sector_t u_size;
 	int err;
@@ -2315,31 +2381,33 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 	if (retcode != NO_ERROR)
 		goto fail;
 
+	mdev = adm_ctx.mdev;
+	if (!get_ldev(mdev)) {
+		retcode = ERR_NO_DISK;
+		goto fail;
+	}
+
 	memset(&rs, 0, sizeof(struct resize_parms));
+	rs.al_stripes = mdev->ldev->md.al_stripes;
+	rs.al_stripe_size = mdev->ldev->md.al_stripe_size_4k * 4;
 	if (info->attrs[DRBD_NLA_RESIZE_PARMS]) {
 		err = resize_parms_from_attrs(&rs, info);
 		if (err) {
 			retcode = ERR_MANDATORY_TAG;
 			drbd_msg_put_info(from_attrs_err_to_txt(err));
-			goto fail;
+			goto fail_ldev;
 		}
 	}
 
-	mdev = adm_ctx.mdev;
 	if (mdev->state.conn > C_CONNECTED) {
 		retcode = ERR_RESIZE_RESYNC;
-		goto fail;
+		goto fail_ldev;
 	}
 
 	if (mdev->state.role == R_SECONDARY &&
 	    mdev->state.peer == R_SECONDARY) {
 		retcode = ERR_NO_PRIMARY;
-		goto fail;
-	}
-
-	if (!get_ldev(mdev)) {
-		retcode = ERR_NO_DISK;
-		goto fail;
+		goto fail_ldev;
 	}
 
 	if (rs.no_resync && mdev->tconn->agreed_pro_version < 93) {
@@ -2358,6 +2426,28 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 		}
 	}
 
+	if (mdev->ldev->md.al_stripes != rs.al_stripes ||
+	    mdev->ldev->md.al_stripe_size_4k != rs.al_stripe_size / 4) {
+		u32 al_size_k = rs.al_stripes * rs.al_stripe_size;
+
+		if (al_size_k > (16 * 1024 * 1024)) {
+			retcode = ERR_MD_LAYOUT_TOO_BIG;
+			goto fail_ldev;
+		}
+
+		if (al_size_k < MD_32kB_SECT/2) {
+			retcode = ERR_MD_LAYOUT_TOO_SMALL;
+			goto fail_ldev;
+		}
+
+		if (mdev->state.conn != C_CONNECTED) {
+			retcode = ERR_MD_LAYOUT_CONNECTED;
+			goto fail_ldev;
+		}
+
+		change_al_layout = true;
+	}
+
 	if (mdev->ldev->known_size != drbd_get_capacity(mdev->ldev->backing_bdev))
 		mdev->ldev->known_size = drbd_get_capacity(mdev->ldev->backing_bdev);
 
@@ -2373,16 +2463,22 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info)
 	}
 
 	ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0);
-	dd = drbd_determine_dev_size(mdev, ddsf);
+	dd = drbd_determine_dev_size(mdev, ddsf, change_al_layout ? &rs : NULL);
 	drbd_md_sync(mdev);
 	put_ldev(mdev);
-	if (dd == dev_size_error) {
+	if (dd == DS_ERROR) {
 		retcode = ERR_NOMEM_BITMAP;
 		goto fail;
+	} else if (dd == DS_ERROR_SPACE_MD) {
+		retcode = ERR_MD_LAYOUT_NO_FIT;
+		goto fail;
+	} else if (dd == DS_ERROR_SHRINK) {
+		retcode = ERR_IMPLICIT_SHRINK;
+		goto fail;
 	}
 
 	if (mdev->state.conn == C_CONNECTED) {
-		if (dd == grew)
+		if (dd == DS_GREW)
 			set_bit(RESIZE_PENDING, &mdev->flags);
 
 		drbd_send_uuids(mdev);
@@ -2658,7 +2754,6 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
 		const struct sib_info *sib)
 {
 	struct state_info *si = NULL; /* for sizeof(si->member); */
-	struct net_conf *nc;
 	struct nlattr *nla;
 	int got_ldev;
 	int err = 0;
@@ -2688,13 +2783,19 @@ int nla_put_status_info(struct sk_buff *skb, struct drbd_conf *mdev,
 		goto nla_put_failure;
 
 	rcu_read_lock();
-	if (got_ldev)
-		if (disk_conf_to_skb(skb, rcu_dereference(mdev->ldev->disk_conf), exclude_sensitive))
-			goto nla_put_failure;
+	if (got_ldev) {
+		struct disk_conf *disk_conf;
 
-	nc = rcu_dereference(mdev->tconn->net_conf);
-	if (nc)
-		err = net_conf_to_skb(skb, nc, exclude_sensitive);
+		disk_conf = rcu_dereference(mdev->ldev->disk_conf);
+		err = disk_conf_to_skb(skb, disk_conf, exclude_sensitive);
+	}
+	if (!err) {
+		struct net_conf *nc;
+
+		nc = rcu_dereference(mdev->tconn->net_conf);
+		if (nc)
+			err = net_conf_to_skb(skb, nc, exclude_sensitive);
+	}
 	rcu_read_unlock();
 	if (err)
 		goto nla_put_failure;
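For reference, the activity-log layout bounds enforced in drbd_adm_resize() above reduce to a size check in KiB; a hypothetical helper restating them (not part of this patch, and the reading of MD_32kB_SECT/2 as 32 KiB is my assumption):

/* Hypothetical helper mirroring the checks in drbd_adm_resize():
 * 32 KiB (MD_32kB_SECT/2) lower bound, 16 GiB upper bound. */
static bool al_layout_valid(u32 al_stripes, u32 al_stripe_size_k)
{
	u32 al_size_k = al_stripes * al_stripe_size_k;

	return al_size_k >= 32 && al_size_k <= 16 * 1024 * 1024;
}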
diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c
index 4222affff488..cc29cd3bf78b 100644
--- a/drivers/block/drbd/drbd_receiver.c
+++ b/drivers/block/drbd/drbd_receiver.c
@@ -1039,6 +1039,8 @@ randomize:
 	rcu_read_lock();
 	idr_for_each_entry(&tconn->volumes, mdev, vnr) {
 		kref_get(&mdev->kref);
+		rcu_read_unlock();
+
 		/* Prevent a race between resync-handshake and
 		 * being promoted to Primary.
 		 *
@@ -1049,8 +1051,6 @@ randomize:
 		mutex_lock(mdev->state_mutex);
 		mutex_unlock(mdev->state_mutex);
 
-		rcu_read_unlock();
-
 		if (discard_my_data)
 			set_bit(DISCARD_MY_DATA, &mdev->flags);
 		else
@@ -3545,7 +3545,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
 {
 	struct drbd_conf *mdev;
 	struct p_sizes *p = pi->data;
-	enum determine_dev_size dd = unchanged;
+	enum determine_dev_size dd = DS_UNCHANGED;
 	sector_t p_size, p_usize, my_usize;
 	int ldsc = 0; /* local disk size changed */
 	enum dds_flags ddsf;
@@ -3617,9 +3617,9 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
 
 	ddsf = be16_to_cpu(p->dds_flags);
 	if (get_ldev(mdev)) {
-		dd = drbd_determine_dev_size(mdev, ddsf);
+		dd = drbd_determine_dev_size(mdev, ddsf, NULL);
 		put_ldev(mdev);
-		if (dd == dev_size_error)
+		if (dd == DS_ERROR)
 			return -EIO;
 		drbd_md_sync(mdev);
 	} else {
@@ -3647,7 +3647,7 @@ static int receive_sizes(struct drbd_tconn *tconn, struct packet_info *pi)
 			drbd_send_sizes(mdev, 0, ddsf);
 		}
 		if (test_and_clear_bit(RESIZE_PENDING, &mdev->flags) ||
-		    (dd == grew && mdev->state.conn == C_CONNECTED)) {
+		    (dd == DS_GREW && mdev->state.conn == C_CONNECTED)) {
 			if (mdev->state.pdsk >= D_INCONSISTENT &&
 			    mdev->state.disk >= D_INCONSISTENT) {
 				if (ddsf & DDSF_NO_RESYNC)
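The reordering in the first receiver hunk above follows the standard pattern for walking an RCU-protected idr when the loop body may sleep: take a reference under rcu_read_lock(), drop the read lock before the sleeping call, and re-acquire it before the next iteration. A generic sketch with hypothetical names:

rcu_read_lock();
idr_for_each_entry(&some_idr, obj, i) {
	kref_get(&obj->kref);
	rcu_read_unlock();		/* mutex_lock() below may sleep */

	mutex_lock(&obj->state_mutex);
	mutex_unlock(&obj->state_mutex);

	kref_put(&obj->kref, release_obj);	/* hypothetical release */
	rcu_read_lock();
}
rcu_read_unlock();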
diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c
index 90c5be2b1d30..216d47b7e88b 100644
--- a/drivers/block/drbd/drbd_state.c
+++ b/drivers/block/drbd/drbd_state.c
@@ -1115,8 +1115,10 @@ __drbd_set_state(struct drbd_conf *mdev, union drbd_state ns,
 		drbd_thread_restart_nowait(&mdev->tconn->receiver);
 
 	/* Resume AL writing if we get a connection */
-	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED)
+	if (os.conn < C_CONNECTED && ns.conn >= C_CONNECTED) {
 		drbd_resume_al(mdev);
+		mdev->tconn->connect_cnt++;
+	}
 
 	/* remember last attach time so request_timer_fn() won't
 	 * kill newly established sessions while we are still trying to thaw
diff --git a/drivers/block/rsxx/core.c b/drivers/block/rsxx/core.c
index 5af21f2db29c..6e85e21445eb 100644
--- a/drivers/block/rsxx/core.c
+++ b/drivers/block/rsxx/core.c
@@ -31,6 +31,8 @@
 #include <linux/slab.h>
 #include <linux/bitops.h>
 #include <linux/delay.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
 
 #include <linux/genhd.h>
 #include <linux/idr.h>
@@ -39,8 +41,9 @@
 #include "rsxx_cfg.h"
 
 #define NO_LEGACY 0
+#define SYNC_START_TIMEOUT (10 * 60) /* 10 minutes */
 
-MODULE_DESCRIPTION("IBM FlashSystem 70/80 PCIe SSD Device Driver");
+MODULE_DESCRIPTION("IBM Flash Adapter 900GB Full Height Device Driver");
 MODULE_AUTHOR("Joshua Morris/Philip Kelleher, IBM");
 MODULE_LICENSE("GPL");
 MODULE_VERSION(DRIVER_VERSION);
@@ -49,9 +52,282 @@ static unsigned int force_legacy = NO_LEGACY;
 module_param(force_legacy, uint, 0444);
 MODULE_PARM_DESC(force_legacy, "Force the use of legacy type PCI interrupts");
 
+static unsigned int sync_start = 1;
+module_param(sync_start, uint, 0444);
+MODULE_PARM_DESC(sync_start, "On by Default: Driver load will not complete "
+			     "until the card startup has completed.");
+
 static DEFINE_IDA(rsxx_disk_ida);
 static DEFINE_SPINLOCK(rsxx_ida_lock);
 
+/* --------------------Debugfs Setup ------------------- */
+
+struct rsxx_cram {
+	u32 f_pos;
+	u32 offset;
+	void *i_private;
+};
+
+static int rsxx_attr_pci_regs_show(struct seq_file *m, void *p)
+{
+	struct rsxx_cardinfo *card = m->private;
+
+	seq_printf(m, "HWID		0x%08x\n",
+					ioread32(card->regmap + HWID));
+	seq_printf(m, "SCRATCH		0x%08x\n",
+					ioread32(card->regmap + SCRATCH));
+	seq_printf(m, "IER		0x%08x\n",
+					ioread32(card->regmap + IER));
+	seq_printf(m, "IPR		0x%08x\n",
+					ioread32(card->regmap + IPR));
+	seq_printf(m, "CREG_CMD		0x%08x\n",
+					ioread32(card->regmap + CREG_CMD));
+	seq_printf(m, "CREG_ADD		0x%08x\n",
+					ioread32(card->regmap + CREG_ADD));
+	seq_printf(m, "CREG_CNT		0x%08x\n",
+					ioread32(card->regmap + CREG_CNT));
+	seq_printf(m, "CREG_STAT	0x%08x\n",
+					ioread32(card->regmap + CREG_STAT));
+	seq_printf(m, "CREG_DATA0	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA0));
+	seq_printf(m, "CREG_DATA1	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA1));
+	seq_printf(m, "CREG_DATA2	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA2));
+	seq_printf(m, "CREG_DATA3	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA3));
+	seq_printf(m, "CREG_DATA4	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA4));
+	seq_printf(m, "CREG_DATA5	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA5));
+	seq_printf(m, "CREG_DATA6	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA6));
+	seq_printf(m, "CREG_DATA7	0x%08x\n",
+					ioread32(card->regmap + CREG_DATA7));
+	seq_printf(m, "INTR_COAL	0x%08x\n",
+					ioread32(card->regmap + INTR_COAL));
+	seq_printf(m, "HW_ERROR		0x%08x\n",
+					ioread32(card->regmap + HW_ERROR));
+	seq_printf(m, "DEBUG0		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG0));
+	seq_printf(m, "DEBUG1		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG1));
+	seq_printf(m, "DEBUG2		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG2));
+	seq_printf(m, "DEBUG3		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG3));
+	seq_printf(m, "DEBUG4		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG4));
+	seq_printf(m, "DEBUG5		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG5));
+	seq_printf(m, "DEBUG6		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG6));
+	seq_printf(m, "DEBUG7		0x%08x\n",
+					ioread32(card->regmap + PCI_DEBUG7));
+	seq_printf(m, "RECONFIG		0x%08x\n",
+					ioread32(card->regmap + PCI_RECONFIG));
+
+	return 0;
+}
+
+static int rsxx_attr_stats_show(struct seq_file *m, void *p)
+{
+	struct rsxx_cardinfo *card = m->private;
+	int i;
+
+	for (i = 0; i < card->n_targets; i++) {
+		seq_printf(m, "Ctrl %d CRC Errors	= %d\n",
+				i, card->ctrl[i].stats.crc_errors);
+		seq_printf(m, "Ctrl %d Hard Errors	= %d\n",
+				i, card->ctrl[i].stats.hard_errors);
+		seq_printf(m, "Ctrl %d Soft Errors	= %d\n",
+				i, card->ctrl[i].stats.soft_errors);
+		seq_printf(m, "Ctrl %d Writes Issued	= %d\n",
+				i, card->ctrl[i].stats.writes_issued);
+		seq_printf(m, "Ctrl %d Writes Failed	= %d\n",
+				i, card->ctrl[i].stats.writes_failed);
+		seq_printf(m, "Ctrl %d Reads Issued	= %d\n",
+				i, card->ctrl[i].stats.reads_issued);
+		seq_printf(m, "Ctrl %d Reads Failed	= %d\n",
+				i, card->ctrl[i].stats.reads_failed);
+		seq_printf(m, "Ctrl %d Reads Retried	= %d\n",
+				i, card->ctrl[i].stats.reads_retried);
+		seq_printf(m, "Ctrl %d Discards Issued	= %d\n",
+				i, card->ctrl[i].stats.discards_issued);
+		seq_printf(m, "Ctrl %d Discards Failed	= %d\n",
+				i, card->ctrl[i].stats.discards_failed);
+		seq_printf(m, "Ctrl %d DMA SW Errors	= %d\n",
+				i, card->ctrl[i].stats.dma_sw_err);
+		seq_printf(m, "Ctrl %d DMA HW Faults	= %d\n",
+				i, card->ctrl[i].stats.dma_hw_fault);
+		seq_printf(m, "Ctrl %d DMAs Cancelled	= %d\n",
+				i, card->ctrl[i].stats.dma_cancelled);
+		seq_printf(m, "Ctrl %d SW Queue Depth	= %d\n",
+				i, card->ctrl[i].stats.sw_q_depth);
+		seq_printf(m, "Ctrl %d HW Queue Depth	= %d\n",
+			i, atomic_read(&card->ctrl[i].stats.hw_q_depth));
+	}
+
+	return 0;
+}
+
+static int rsxx_attr_stats_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rsxx_attr_stats_show, inode->i_private);
+}
+
+static int rsxx_attr_pci_regs_open(struct inode *inode, struct file *file)
+{
+	return single_open(file, rsxx_attr_pci_regs_show, inode->i_private);
+}
+
+static ssize_t rsxx_cram_read(struct file *fp, char __user *ubuf,
+			      size_t cnt, loff_t *ppos)
+{
+	struct rsxx_cram *info = fp->private_data;
+	struct rsxx_cardinfo *card = info->i_private;
+	char *buf;
+	int st;
+
+	buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	info->f_pos = (u32)*ppos + info->offset;
+
+	st = rsxx_creg_read(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
+	if (st)
+		return st;
+
+	st = copy_to_user(ubuf, buf, cnt);
+	if (st)
+		return st;
+
+	info->offset += cnt;
+
+	kfree(buf);
+
+	return cnt;
+}
+
+static ssize_t rsxx_cram_write(struct file *fp, const char __user *ubuf,
+			       size_t cnt, loff_t *ppos)
+{
+	struct rsxx_cram *info = fp->private_data;
+	struct rsxx_cardinfo *card = info->i_private;
+	char *buf;
+	int st;
+
+	buf = kzalloc(sizeof(*buf) * cnt, GFP_KERNEL);
+	if (!buf)
+		return -ENOMEM;
+
+	st = copy_from_user(buf, ubuf, cnt);
+	if (st)
+		return st;
+
+	info->f_pos = (u32)*ppos + info->offset;
+
+	st = rsxx_creg_write(card, CREG_ADD_CRAM + info->f_pos, cnt, buf, 1);
+	if (st)
+		return st;
+
+	info->offset += cnt;
+
+	kfree(buf);
+
+	return cnt;
+}
+
+static int rsxx_cram_open(struct inode *inode, struct file *file)
+{
+	struct rsxx_cram *info = kzalloc(sizeof(*info), GFP_KERNEL);
+	if (!info)
+		return -ENOMEM;
+
+	info->i_private = inode->i_private;
+	info->f_pos = file->f_pos;
+	file->private_data = info;
+
+	return 0;
+}
+
+static int rsxx_cram_release(struct inode *inode, struct file *file)
+{
+	struct rsxx_cram *info = file->private_data;
+
+	if (!info)
+		return 0;
+
+	kfree(info);
+	file->private_data = NULL;
+
+	return 0;
+}
+
+static const struct file_operations debugfs_cram_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rsxx_cram_open,
+	.read		= rsxx_cram_read,
+	.write		= rsxx_cram_write,
+	.release	= rsxx_cram_release,
+};
+
+static const struct file_operations debugfs_stats_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rsxx_attr_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static const struct file_operations debugfs_pci_regs_fops = {
+	.owner		= THIS_MODULE,
+	.open		= rsxx_attr_pci_regs_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static void rsxx_debugfs_dev_new(struct rsxx_cardinfo *card)
+{
+	struct dentry *debugfs_stats;
+	struct dentry *debugfs_pci_regs;
+	struct dentry *debugfs_cram;
+
+	card->debugfs_dir = debugfs_create_dir(card->gendisk->disk_name, NULL);
+	if (IS_ERR_OR_NULL(card->debugfs_dir))
+		goto failed_debugfs_dir;
+
+	debugfs_stats = debugfs_create_file("stats", S_IRUGO,
+					    card->debugfs_dir, card,
+					    &debugfs_stats_fops);
+	if (IS_ERR_OR_NULL(debugfs_stats))
+		goto failed_debugfs_stats;
+
+	debugfs_pci_regs = debugfs_create_file("pci_regs", S_IRUGO,
+					       card->debugfs_dir, card,
+					       &debugfs_pci_regs_fops);
+	if (IS_ERR_OR_NULL(debugfs_pci_regs))
+		goto failed_debugfs_pci_regs;
+
+	debugfs_cram = debugfs_create_file("cram", S_IRUGO | S_IWUSR,
+					   card->debugfs_dir, card,
+					   &debugfs_cram_fops);
+	if (IS_ERR_OR_NULL(debugfs_cram))
+		goto failed_debugfs_cram;
+
+	return;
+failed_debugfs_cram:
+	debugfs_remove(debugfs_pci_regs);
+failed_debugfs_pci_regs:
+	debugfs_remove(debugfs_stats);
+failed_debugfs_stats:
+	debugfs_remove(card->debugfs_dir);
+failed_debugfs_dir:
+	card->debugfs_dir = NULL;
+}
+
 /*----------------- Interrupt Control & Handling -------------------*/
 
 static void rsxx_mask_interrupts(struct rsxx_cardinfo *card)
@@ -163,12 +439,13 @@ static irqreturn_t rsxx_isr(int irq, void *pdata)
 		}
 
 		if (isr & CR_INTR_CREG) {
-			schedule_work(&card->creg_ctrl.done_work);
+			queue_work(card->creg_ctrl.creg_wq,
+				   &card->creg_ctrl.done_work);
 			handled++;
 		}
 
 		if (isr & CR_INTR_EVENT) {
-			schedule_work(&card->event_work);
+			queue_work(card->event_wq, &card->event_work);
 			rsxx_disable_ier_and_isr(card, CR_INTR_EVENT);
 			handled++;
 		}
@@ -329,7 +606,7 @@ static int rsxx_eeh_frozen(struct pci_dev *dev)
 	int i;
 	int st;
 
-	dev_warn(&dev->dev, "IBM FlashSystem PCI: preparing for slot reset.\n");
+	dev_warn(&dev->dev, "IBM Flash Adapter PCI: preparing for slot reset.\n");
 
 	card->eeh_state = 1;
 	rsxx_mask_interrupts(card);
@@ -367,15 +644,26 @@ static void rsxx_eeh_failure(struct pci_dev *dev)
 {
 	struct rsxx_cardinfo *card = pci_get_drvdata(dev);
 	int i;
+	int cnt = 0;
 
-	dev_err(&dev->dev, "IBM FlashSystem PCI: disabling failed card.\n");
+	dev_err(&dev->dev, "IBM Flash Adapter PCI: disabling failed card.\n");
 
 	card->eeh_state = 1;
+	card->halt = 1;
 
-	for (i = 0; i < card->n_targets; i++)
-		del_timer_sync(&card->ctrl[i].activity_timer);
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_bh(&card->ctrl[i].queue_lock);
+		cnt = rsxx_cleanup_dma_queue(&card->ctrl[i],
+					     &card->ctrl[i].queue);
+		spin_unlock_bh(&card->ctrl[i].queue_lock);
+
+		cnt += rsxx_dma_cancel(&card->ctrl[i]);
 
-	rsxx_eeh_cancel_dmas(card);
+		if (cnt)
+			dev_info(CARD_TO_DEV(card),
+				"Freed %d queued DMAs on channel %d\n",
+				cnt, card->ctrl[i].id);
+	}
 }
 
 static int rsxx_eeh_fifo_flush_poll(struct rsxx_cardinfo *card)
@@ -432,7 +720,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
 	int st;
 
 	dev_warn(&dev->dev,
-		"IBM FlashSystem PCI: recovering from slot reset.\n");
+		"IBM Flash Adapter PCI: recovering from slot reset.\n");
 
 	st = pci_enable_device(dev);
 	if (st)
@@ -485,7 +773,7 @@ static pci_ers_result_t rsxx_slot_reset(struct pci_dev *dev)
 				&card->ctrl[i].issue_dma_work);
 	}
 
-	dev_info(&dev->dev, "IBM FlashSystem PCI: recovery complete.\n");
+	dev_info(&dev->dev, "IBM Flash Adapter PCI: recovery complete.\n");
 
 	return PCI_ERS_RESULT_RECOVERED;
 
@@ -528,6 +816,7 @@ static int rsxx_pci_probe(struct pci_dev *dev,
 {
 	struct rsxx_cardinfo *card;
 	int st;
+	unsigned int sync_timeout;
 
 	dev_info(&dev->dev, "PCI-Flash SSD discovered\n");
 
@@ -610,7 +899,11 @@ static int rsxx_pci_probe(struct pci_dev *dev,
 	}
 
 	/************* Setup Processor Command Interface *************/
-	rsxx_creg_setup(card);
+	st = rsxx_creg_setup(card);
+	if (st) {
+		dev_err(CARD_TO_DEV(card), "Failed to setup creg interface.\n");
+		goto failed_creg_setup;
+	}
 
 	spin_lock_irq(&card->irq_lock);
 	rsxx_enable_ier_and_isr(card, CR_INTR_CREG);
@@ -650,6 +943,12 @@ static int rsxx_pci_probe(struct pci_dev *dev,
 	}
 
 	/************* Setup Card Event Handler *************/
+	card->event_wq = create_singlethread_workqueue(DRIVER_NAME"_event");
+	if (!card->event_wq) {
+		dev_err(CARD_TO_DEV(card), "Failed card event setup.\n");
+		goto failed_event_handler;
+	}
+
 	INIT_WORK(&card->event_work, card_event_handler);
 
 	st = rsxx_setup_dev(card);
@@ -676,6 +975,33 @@ static int rsxx_pci_probe(struct pci_dev *dev,
 		if (st)
 			dev_crit(CARD_TO_DEV(card),
 				"Failed issuing card startup\n");
+		if (sync_start) {
+			sync_timeout = SYNC_START_TIMEOUT;
+
+			dev_info(CARD_TO_DEV(card),
+				 "Waiting for card to startup\n");
+
+			do {
+				ssleep(1);
+				sync_timeout--;
+
+				rsxx_get_card_state(card, &card->state);
+			} while (sync_timeout &&
+				(card->state == CARD_STATE_STARTING));
+
+			if (card->state == CARD_STATE_STARTING) {
+				dev_warn(CARD_TO_DEV(card),
+					 "Card startup timed out\n");
+				card->size8 = 0;
+			} else {
+				dev_info(CARD_TO_DEV(card),
+					"card state: %s\n",
+					rsxx_card_state_to_str(card->state));
+				st = rsxx_get_card_size8(card, &card->size8);
+				if (st)
+					card->size8 = 0;
+			}
+		}
 	} else if (card->state == CARD_STATE_GOOD ||
 		   card->state == CARD_STATE_RD_ONLY_FAULT) {
 		st = rsxx_get_card_size8(card, &card->size8);
@@ -685,12 +1011,21 @@ static int rsxx_pci_probe(struct pci_dev *dev,
 
 	rsxx_attach_dev(card);
 
+	/************* Setup Debugfs *************/
+	rsxx_debugfs_dev_new(card);
+
 	return 0;
 
 failed_create_dev:
+	destroy_workqueue(card->event_wq);
+	card->event_wq = NULL;
+failed_event_handler:
 	rsxx_dma_destroy(card);
 failed_dma_setup:
 failed_compatiblity_check:
+	destroy_workqueue(card->creg_ctrl.creg_wq);
+	card->creg_ctrl.creg_wq = NULL;
+failed_creg_setup:
 	spin_lock_irq(&card->irq_lock);
 	rsxx_disable_ier_and_isr(card, CR_INTR_ALL);
 	spin_unlock_irq(&card->irq_lock);
@@ -756,6 +1091,8 @@ static void rsxx_pci_remove(struct pci_dev *dev)
 	/* Prevent work_structs from re-queuing themselves. */
 	card->halt = 1;
 
+	debugfs_remove_recursive(card->debugfs_dir);
+
 	free_irq(dev->irq, card);
 
 	if (!force_legacy)
diff --git a/drivers/block/rsxx/cregs.c b/drivers/block/rsxx/cregs.c
index 4b5c020a0a65..926dce9c452f 100644
--- a/drivers/block/rsxx/cregs.c
+++ b/drivers/block/rsxx/cregs.c
@@ -431,6 +431,15 @@ static int __issue_creg_rw(struct rsxx_cardinfo *card,
 	*hw_stat = completion.creg_status;
 
 	if (completion.st) {
+		/*
+		* This read is needed to verify that no extreme errors, such
+		* as an EEH event, have occurred. iowrite32() will not detect
+		* EEH errors, so it is necessary that we recover here if such
+		* an error is the reason for the timeout. This is a dummy read.
+		*/
+		ioread32(card->regmap + SCRATCH);
+
 		dev_warn(CARD_TO_DEV(card),
 			"creg command failed(%d x%08x)\n",
 			completion.st, addr);
@@ -727,6 +736,11 @@ int rsxx_creg_setup(struct rsxx_cardinfo *card)
 {
 	card->creg_ctrl.active_cmd = NULL;
 
+	card->creg_ctrl.creg_wq =
+			create_singlethread_workqueue(DRIVER_NAME"_creg");
+	if (!card->creg_ctrl.creg_wq)
+		return -ENOMEM;
+
 	INIT_WORK(&card->creg_ctrl.done_work, creg_cmd_done);
 	mutex_init(&card->creg_ctrl.reset_lock);
 	INIT_LIST_HEAD(&card->creg_ctrl.queue);
diff --git a/drivers/block/rsxx/dev.c b/drivers/block/rsxx/dev.c
index 4346d17d2949..d7af441880be 100644
--- a/drivers/block/rsxx/dev.c
+++ b/drivers/block/rsxx/dev.c
@@ -155,7 +155,8 @@ static void bio_dma_done_cb(struct rsxx_cardinfo *card,
 		atomic_set(&meta->error, 1);
 
 	if (atomic_dec_and_test(&meta->pending_dmas)) {
-		disk_stats_complete(card, meta->bio, meta->start_time);
+		if (!card->eeh_state && card->gendisk)
+			disk_stats_complete(card, meta->bio, meta->start_time);
 
 		bio_endio(meta->bio, atomic_read(&meta->error) ? -EIO : 0);
 		kmem_cache_free(bio_meta_pool, meta);
@@ -170,6 +171,12 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
 
 	might_sleep();
 
+	if (!card)
+		goto req_err;
+
+	if (bio->bi_sector + (bio->bi_size >> 9) > get_capacity(card->gendisk))
+		goto req_err;
+
 	if (unlikely(card->halt)) {
 		st = -EFAULT;
 		goto req_err;
@@ -196,7 +203,8 @@ static void rsxx_make_request(struct request_queue *q, struct bio *bio)
 	atomic_set(&bio_meta->pending_dmas, 0);
 	bio_meta->start_time = jiffies;
 
-	disk_stats_start(card, bio);
+	if (!unlikely(card->halt))
+		disk_stats_start(card, bio);
 
 	dev_dbg(CARD_TO_DEV(card), "BIO[%c]: meta: %p addr8: x%llx size: %d\n",
 		 bio_data_dir(bio) ? 'W' : 'R', bio_meta,
@@ -225,24 +233,6 @@ static bool rsxx_discard_supported(struct rsxx_cardinfo *card)
 	return (pci_rev >= RSXX_DISCARD_SUPPORT);
 }
 
-static unsigned short rsxx_get_logical_block_size(
-					struct rsxx_cardinfo *card)
-{
-	u32 capabilities = 0;
-	int st;
-
-	st = rsxx_get_card_capabilities(card, &capabilities);
-	if (st)
-		dev_warn(CARD_TO_DEV(card),
-			"Failed reading card capabilities register\n");
-
-	/* Earlier firmware did not have support for 512 byte accesses */
-	if (capabilities & CARD_CAP_SUBPAGE_WRITES)
-		return 512;
-	else
-		return RSXX_HW_BLK_SIZE;
-}
-
 int rsxx_attach_dev(struct rsxx_cardinfo *card)
 {
 	mutex_lock(&card->dev_lock);
@@ -305,7 +295,7 @@ int rsxx_setup_dev(struct rsxx_cardinfo *card)
 		return -ENOMEM;
 	}
 
-	blk_size = rsxx_get_logical_block_size(card);
+	blk_size = card->config.data.block_size;
 
 	blk_queue_make_request(card->queue, rsxx_make_request);
 	blk_queue_bounce_limit(card->queue, BLK_BOUNCE_ANY);
@@ -347,6 +337,7 @@ void rsxx_destroy_dev(struct rsxx_cardinfo *card)
 	card->gendisk = NULL;
 
 	blk_cleanup_queue(card->queue);
+	card->queue->queuedata = NULL;
 	unregister_blkdev(card->major, DRIVER_NAME);
 }
 
diff --git a/drivers/block/rsxx/dma.c b/drivers/block/rsxx/dma.c
index 0607513cfb41..bed32f16b084 100644
--- a/drivers/block/rsxx/dma.c
+++ b/drivers/block/rsxx/dma.c
@@ -245,6 +245,22 @@ static void rsxx_complete_dma(struct rsxx_dma_ctrl *ctrl,
 	kmem_cache_free(rsxx_dma_pool, dma);
 }
 
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl,
+			   struct list_head *q)
+{
+	struct rsxx_dma *dma;
+	struct rsxx_dma *tmp;
+	int cnt = 0;
+
+	list_for_each_entry_safe(dma, tmp, q, list) {
+		list_del(&dma->list);
+		rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+		cnt++;
+	}
+
+	return cnt;
+}
+
 static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
 				 struct rsxx_dma *dma)
 {
@@ -252,9 +268,10 @@ static void rsxx_requeue_dma(struct rsxx_dma_ctrl *ctrl,
 	 * Requeued DMAs go to the front of the queue so they are issued
 	 * first.
 	 */
-	spin_lock(&ctrl->queue_lock);
+	spin_lock_bh(&ctrl->queue_lock);
+	ctrl->stats.sw_q_depth++;
 	list_add(&dma->list, &ctrl->queue);
-	spin_unlock(&ctrl->queue_lock);
+	spin_unlock_bh(&ctrl->queue_lock);
 }
 
 static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
@@ -329,6 +346,7 @@ static void rsxx_handle_dma_error(struct rsxx_dma_ctrl *ctrl,
 static void dma_engine_stalled(unsigned long data)
 {
 	struct rsxx_dma_ctrl *ctrl = (struct rsxx_dma_ctrl *)data;
+	int cnt;
 
 	if (atomic_read(&ctrl->stats.hw_q_depth) == 0 ||
 	    unlikely(ctrl->card->eeh_state))
@@ -349,18 +367,28 @@ static void dma_engine_stalled(unsigned long data)
 			"DMA channel %d has stalled, faulting interface.\n",
 			ctrl->id);
 		ctrl->card->dma_fault = 1;
+
+		/* Clean up the DMA queue */
+		spin_lock(&ctrl->queue_lock);
+		cnt = rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+		spin_unlock(&ctrl->queue_lock);
+
+		cnt += rsxx_dma_cancel(ctrl);
+
+		if (cnt)
+			dev_info(CARD_TO_DEV(ctrl->card),
+				"Freed %d queued DMAs on channel %d\n",
+				cnt, ctrl->id);
 	}
 }
 
-static void rsxx_issue_dmas(struct work_struct *work)
+static void rsxx_issue_dmas(struct rsxx_dma_ctrl *ctrl)
 {
-	struct rsxx_dma_ctrl *ctrl;
 	struct rsxx_dma *dma;
 	int tag;
 	int cmds_pending = 0;
 	struct hw_cmd *hw_cmd_buf;
 
-	ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
 	hw_cmd_buf = ctrl->cmd.buf;
 
 	if (unlikely(ctrl->card->halt) ||
@@ -368,22 +396,22 @@ static void rsxx_issue_dmas(struct work_struct *work)
 		return;
 
 	while (1) {
-		spin_lock(&ctrl->queue_lock);
+		spin_lock_bh(&ctrl->queue_lock);
 		if (list_empty(&ctrl->queue)) {
-			spin_unlock(&ctrl->queue_lock);
+			spin_unlock_bh(&ctrl->queue_lock);
 			break;
 		}
-		spin_unlock(&ctrl->queue_lock);
+		spin_unlock_bh(&ctrl->queue_lock);
 
 		tag = pop_tracker(ctrl->trackers);
 		if (tag == -1)
 			break;
 
-		spin_lock(&ctrl->queue_lock);
+		spin_lock_bh(&ctrl->queue_lock);
 		dma = list_entry(ctrl->queue.next, struct rsxx_dma, list);
 		list_del(&dma->list);
 		ctrl->stats.sw_q_depth--;
-		spin_unlock(&ctrl->queue_lock);
+		spin_unlock_bh(&ctrl->queue_lock);
 
 		/*
 		 * This will catch any DMAs that slipped in right before the
@@ -440,9 +468,8 @@ static void rsxx_issue_dmas(struct work_struct *work)
 	}
 }
 
-static void rsxx_dma_done(struct work_struct *work)
+static void rsxx_dma_done(struct rsxx_dma_ctrl *ctrl)
 {
-	struct rsxx_dma_ctrl *ctrl;
 	struct rsxx_dma *dma;
 	unsigned long flags;
 	u16 count;
@@ -450,7 +477,6 @@ static void rsxx_dma_done(struct work_struct *work)
 	u8 tag;
 	struct hw_status *hw_st_buf;
 
-	ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
 	hw_st_buf = ctrl->status.buf;
 
 	if (unlikely(ctrl->card->halt) ||
@@ -520,33 +546,32 @@ static void rsxx_dma_done(struct work_struct *work)
 	rsxx_enable_ier(ctrl->card, CR_INTR_DMA(ctrl->id));
 	spin_unlock_irqrestore(&ctrl->card->irq_lock, flags);
 
-	spin_lock(&ctrl->queue_lock);
+	spin_lock_bh(&ctrl->queue_lock);
 	if (ctrl->stats.sw_q_depth)
 		queue_work(ctrl->issue_wq, &ctrl->issue_dma_work);
-	spin_unlock(&ctrl->queue_lock);
+	spin_unlock_bh(&ctrl->queue_lock);
 }
 
-static int rsxx_cleanup_dma_queue(struct rsxx_cardinfo *card,
-				      struct list_head *q)
+static void rsxx_schedule_issue(struct work_struct *work)
 {
-	struct rsxx_dma *dma;
-	struct rsxx_dma *tmp;
-	int cnt = 0;
+	struct rsxx_dma_ctrl *ctrl;
 
-	list_for_each_entry_safe(dma, tmp, q, list) {
-		list_del(&dma->list);
+	ctrl = container_of(work, struct rsxx_dma_ctrl, issue_dma_work);
 
-		if (dma->dma_addr)
-			pci_unmap_page(card->dev, dma->dma_addr,
-				       get_dma_size(dma),
-				       (dma->cmd == HW_CMD_BLK_WRITE) ?
-				       PCI_DMA_TODEVICE :
-				       PCI_DMA_FROMDEVICE);
-		kmem_cache_free(rsxx_dma_pool, dma);
-		cnt++;
-	}
+	mutex_lock(&ctrl->work_lock);
+	rsxx_issue_dmas(ctrl);
+	mutex_unlock(&ctrl->work_lock);
+}
 
-	return cnt;
+static void rsxx_schedule_done(struct work_struct *work)
+{
+	struct rsxx_dma_ctrl *ctrl;
+
+	ctrl = container_of(work, struct rsxx_dma_ctrl, dma_done_work);
+
+	mutex_lock(&ctrl->work_lock);
+	rsxx_dma_done(ctrl);
+	mutex_unlock(&ctrl->work_lock);
 }
 
 static int rsxx_queue_discard(struct rsxx_cardinfo *card,
@@ -698,10 +723,10 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 
 	for (i = 0; i < card->n_targets; i++) {
 		if (!list_empty(&dma_list[i])) {
-			spin_lock(&card->ctrl[i].queue_lock);
+			spin_lock_bh(&card->ctrl[i].queue_lock);
 			card->ctrl[i].stats.sw_q_depth += dma_cnt[i];
 			list_splice_tail(&dma_list[i], &card->ctrl[i].queue);
-			spin_unlock(&card->ctrl[i].queue_lock);
+			spin_unlock_bh(&card->ctrl[i].queue_lock);
 
 			queue_work(card->ctrl[i].issue_wq,
 				   &card->ctrl[i].issue_dma_work);
@@ -711,8 +736,11 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 	return 0;
 
 bvec_err:
-	for (i = 0; i < card->n_targets; i++)
-		rsxx_cleanup_dma_queue(card, &dma_list[i]);
+	for (i = 0; i < card->n_targets; i++) {
+		spin_lock_bh(&card->ctrl[i].queue_lock);
+		rsxx_cleanup_dma_queue(&card->ctrl[i], &dma_list[i]);
+		spin_unlock_bh(&card->ctrl[i].queue_lock);
+	}
 
 	return st;
 }
@@ -780,6 +808,7 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
 	spin_lock_init(&ctrl->trackers->lock);
 
 	spin_lock_init(&ctrl->queue_lock);
+	mutex_init(&ctrl->work_lock);
 	INIT_LIST_HEAD(&ctrl->queue);
 
 	setup_timer(&ctrl->activity_timer, dma_engine_stalled,
@@ -793,8 +822,8 @@ static int rsxx_dma_ctrl_init(struct pci_dev *dev,
 	if (!ctrl->done_wq)
 		return -ENOMEM;
 
-	INIT_WORK(&ctrl->issue_dma_work, rsxx_issue_dmas);
-	INIT_WORK(&ctrl->dma_done_work, rsxx_dma_done);
+	INIT_WORK(&ctrl->issue_dma_work, rsxx_schedule_issue);
+	INIT_WORK(&ctrl->dma_done_work, rsxx_schedule_done);
 
 	st = rsxx_hw_buffers_init(dev, ctrl);
 	if (st)
@@ -918,13 +947,30 @@ failed_dma_setup:
 	return st;
 }
 
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl)
+{
+	struct rsxx_dma *dma;
+	int i;
+	int cnt = 0;
+
+	/* Clean up issued DMAs */
+	for (i = 0; i < RSXX_MAX_OUTSTANDING_CMDS; i++) {
+		dma = get_tracker_dma(ctrl->trackers, i);
+		if (dma) {
+			atomic_dec(&ctrl->stats.hw_q_depth);
+			rsxx_complete_dma(ctrl, dma, DMA_CANCELLED);
+			push_tracker(ctrl->trackers, i);
+			cnt++;
+		}
+	}
+
+	return cnt;
+}
 
 void rsxx_dma_destroy(struct rsxx_cardinfo *card)
 {
 	struct rsxx_dma_ctrl *ctrl;
-	struct rsxx_dma *dma;
-	int i, j;
-	int cnt = 0;
+	int i;
 
 	for (i = 0; i < card->n_targets; i++) {
 		ctrl = &card->ctrl[i];
@@ -943,33 +989,11 @@ void rsxx_dma_destroy(struct rsxx_cardinfo *card)
 			del_timer_sync(&ctrl->activity_timer);
 
 		/* Clean up the DMA queue */
-		spin_lock(&ctrl->queue_lock);
-		cnt = rsxx_cleanup_dma_queue(card, &ctrl->queue);
-		spin_unlock(&ctrl->queue_lock);
-
-		if (cnt)
-			dev_info(CARD_TO_DEV(card),
-				"Freed %d queued DMAs on channel %d\n",
-				cnt, i);
-
-		/* Clean up issued DMAs */
-		for (j = 0; j < RSXX_MAX_OUTSTANDING_CMDS; j++) {
-			dma = get_tracker_dma(ctrl->trackers, j);
-			if (dma) {
-				pci_unmap_page(card->dev, dma->dma_addr,
-					       get_dma_size(dma),
-					       (dma->cmd == HW_CMD_BLK_WRITE) ?
-					       PCI_DMA_TODEVICE :
-					       PCI_DMA_FROMDEVICE);
-				kmem_cache_free(rsxx_dma_pool, dma);
-				cnt++;
-			}
-		}
+		spin_lock_bh(&ctrl->queue_lock);
+		rsxx_cleanup_dma_queue(ctrl, &ctrl->queue);
+		spin_unlock_bh(&ctrl->queue_lock);
 
-		if (cnt)
-			dev_info(CARD_TO_DEV(card),
-				"Freed %d pending DMAs on channel %d\n",
-				cnt, i);
+		rsxx_dma_cancel(ctrl);
 
 		vfree(ctrl->trackers);
 
@@ -1013,7 +1037,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 			cnt++;
 		}
 
-		spin_lock(&card->ctrl[i].queue_lock);
+		spin_lock_bh(&card->ctrl[i].queue_lock);
 		list_splice(&issued_dmas[i], &card->ctrl[i].queue);
 
 		atomic_sub(cnt, &card->ctrl[i].stats.hw_q_depth);
@@ -1028,7 +1052,7 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 					       PCI_DMA_TODEVICE :
 					       PCI_DMA_FROMDEVICE);
 		}
-		spin_unlock(&card->ctrl[i].queue_lock);
+		spin_unlock_bh(&card->ctrl[i].queue_lock);
 	}
 
 	kfree(issued_dmas);
@@ -1036,30 +1060,13 @@ int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card)
 	return 0;
 }
 
-void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card)
-{
-	struct rsxx_dma *dma;
-	struct rsxx_dma *tmp;
-	int i;
-
-	for (i = 0; i < card->n_targets; i++) {
-		spin_lock(&card->ctrl[i].queue_lock);
-		list_for_each_entry_safe(dma, tmp, &card->ctrl[i].queue, list) {
-			list_del(&dma->list);
-
-			rsxx_complete_dma(&card->ctrl[i], dma, DMA_CANCELLED);
-		}
-		spin_unlock(&card->ctrl[i].queue_lock);
-	}
-}
-
 int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
 {
 	struct rsxx_dma *dma;
 	int i;
 
 	for (i = 0; i < card->n_targets; i++) {
-		spin_lock(&card->ctrl[i].queue_lock);
+		spin_lock_bh(&card->ctrl[i].queue_lock);
 		list_for_each_entry(dma, &card->ctrl[i].queue, list) {
 			dma->dma_addr = pci_map_page(card->dev, dma->page,
 					dma->pg_off, get_dma_size(dma),
@@ -1067,12 +1074,12 @@ int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card)
 					PCI_DMA_TODEVICE :
 					PCI_DMA_FROMDEVICE);
 			if (!dma->dma_addr) {
-				spin_unlock(&card->ctrl[i].queue_lock);
+				spin_unlock_bh(&card->ctrl[i].queue_lock);
 				kmem_cache_free(rsxx_dma_pool, dma);
 				return -ENOMEM;
 			}
 		}
-		spin_unlock(&card->ctrl[i].queue_lock);
+		spin_unlock_bh(&card->ctrl[i].queue_lock);
 	}
 
 	return 0;
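Taken together, the dma.c changes above introduce a two-level scheme: the per-channel issue and done handlers serialize against each other with ctrl->work_lock, while short queue manipulations nest the (now BH-disabling) ctrl->queue_lock inside it. A condensed sketch of the resulting lock ordering; this is my reading of the diff, not literal driver code:

struct rsxx_dma *dma = NULL;

mutex_lock(&ctrl->work_lock);		/* issue vs. done handlers never overlap */
spin_lock_bh(&ctrl->queue_lock);	/* guards ctrl->queue and stats.sw_q_depth */
if (!list_empty(&ctrl->queue)) {
	dma = list_first_entry(&ctrl->queue, struct rsxx_dma, list);
	list_del(&dma->list);
	ctrl->stats.sw_q_depth--;
}
spin_unlock_bh(&ctrl->queue_lock);
/* ... issue 'dma' to the hardware outside the spinlock ... */
mutex_unlock(&ctrl->work_lock);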
diff --git a/drivers/block/rsxx/rsxx_priv.h b/drivers/block/rsxx/rsxx_priv.h
index 382e8bf5c03b..5ad5055a4104 100644
--- a/drivers/block/rsxx/rsxx_priv.h
+++ b/drivers/block/rsxx/rsxx_priv.h
@@ -39,6 +39,7 @@
 #include <linux/vmalloc.h>
 #include <linux/timer.h>
 #include <linux/ioctl.h>
+#include <linux/delay.h>
 
 #include "rsxx.h"
 #include "rsxx_cfg.h"
@@ -114,6 +115,7 @@ struct rsxx_dma_ctrl {
 	struct timer_list		activity_timer;
 	struct dma_tracker_list		*trackers;
 	struct rsxx_dma_stats		stats;
+	struct mutex			work_lock;
 };
 
 struct rsxx_cardinfo {
@@ -134,6 +136,7 @@ struct rsxx_cardinfo {
 		spinlock_t		lock;
 		bool			active;
 		struct creg_cmd		*active_cmd;
+		struct workqueue_struct	*creg_wq;
 		struct work_struct	done_work;
 		struct list_head	queue;
 		unsigned int		q_depth;
@@ -154,6 +157,7 @@ struct rsxx_cardinfo {
 		int buf_len;
 	} log;
 
+	struct workqueue_struct	*event_wq;
 	struct work_struct	event_work;
 	unsigned int		state;
 	u64			size8;
@@ -181,6 +185,8 @@ struct rsxx_cardinfo {
 
 	int			n_targets;
 	struct rsxx_dma_ctrl	*ctrl;
+
+	struct dentry		*debugfs_dir;
 };
 
 enum rsxx_pci_regmap {
@@ -283,6 +289,7 @@ enum rsxx_creg_addr {
 	CREG_ADD_CAPABILITIES		= 0x80001050,
 	CREG_ADD_LOG			= 0x80002000,
 	CREG_ADD_NUM_TARGETS		= 0x80003000,
+	CREG_ADD_CRAM			= 0xA0000000,
 	CREG_ADD_CONFIG			= 0xB0000000,
 };
 
@@ -372,6 +379,8 @@ typedef void (*rsxx_dma_cb)(struct rsxx_cardinfo *card,
 int rsxx_dma_setup(struct rsxx_cardinfo *card);
 void rsxx_dma_destroy(struct rsxx_cardinfo *card);
 int rsxx_dma_init(void);
+int rsxx_cleanup_dma_queue(struct rsxx_dma_ctrl *ctrl, struct list_head *q);
+int rsxx_dma_cancel(struct rsxx_dma_ctrl *ctrl);
 void rsxx_dma_cleanup(void);
 void rsxx_dma_queue_reset(struct rsxx_cardinfo *card);
 int rsxx_dma_configure(struct rsxx_cardinfo *card);
@@ -382,7 +391,6 @@ int rsxx_dma_queue_bio(struct rsxx_cardinfo *card,
 			   void *cb_data);
 int rsxx_hw_buffers_init(struct pci_dev *dev, struct rsxx_dma_ctrl *ctrl);
 int rsxx_eeh_save_issued_dmas(struct rsxx_cardinfo *card);
-void rsxx_eeh_cancel_dmas(struct rsxx_cardinfo *card);
 int rsxx_eeh_remap_dmas(struct rsxx_cardinfo *card);
 
 /***** cregs.c *****/
diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c
index dd5b2fed97e9..bf4b9d282c04 100644
--- a/drivers/block/xen-blkback/blkback.c
+++ b/drivers/block/xen-blkback/blkback.c
@@ -50,110 +50,118 @@
 #include "common.h"
 
 /*
- * These are rather arbitrary. They are fairly large because adjacent requests
- * pulled from a communication ring are quite likely to end up being part of
- * the same scatter/gather request at the disc.
+ * Maximum number of unused free pages to keep in the internal buffer.
+ * Setting this to a value too low will reduce memory used in each backend,
+ * but can have a performance penalty.
  *
- * ** TRY INCREASING 'xen_blkif_reqs' IF WRITE SPEEDS SEEM TOO LOW **
- *
- * This will increase the chances of being able to write whole tracks.
- * 64 should be enough to keep us competitive with Linux.
+ * A sane value is xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST, but can
+ * be set to a lower value that might degrade performance on some intensive
+ * IO workloads.
  */
-static int xen_blkif_reqs = 64;
-module_param_named(reqs, xen_blkif_reqs, int, 0);
-MODULE_PARM_DESC(reqs, "Number of blkback requests to allocate");
 
-/* Run-time switchable: /sys/module/blkback/parameters/ */
-static unsigned int log_stats;
-module_param(log_stats, int, 0644);
+static int xen_blkif_max_buffer_pages = 1024;
+module_param_named(max_buffer_pages, xen_blkif_max_buffer_pages, int, 0644);
+MODULE_PARM_DESC(max_buffer_pages,
+"Maximum number of free pages to keep in each block backend buffer");
 
 /*
- * Each outstanding request that we've passed to the lower device layers has a
- * 'pending_req' allocated to it. Each buffer_head that completes decrements
- * the pendcnt towards zero. When it hits zero, the specified domain has a
- * response queued for it, with the saved 'id' passed back.
+ * Maximum number of grants to map persistently in blkback. For maximum
+ * performance this should be the total number of grants that can be used
+ * to fill the ring, but since this might become too high, especially with
+ * the use of indirect descriptors, we set it to a value that provides good
+ * performance without using too much memory.
+ *
+ * When the list of persistent grants is full we clean it up using an LRU
+ * algorithm.
  */
-struct pending_req {
-	struct xen_blkif	*blkif;
-	u64			id;
-	int			nr_pages;
-	atomic_t		pendcnt;
-	unsigned short		operation;
-	int			status;
-	struct list_head	free_list;
-	DECLARE_BITMAP(unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-};
 
-#define BLKBACK_INVALID_HANDLE (~0)
+static int xen_blkif_max_pgrants = 1056;
+module_param_named(max_persistent_grants, xen_blkif_max_pgrants, int, 0644);
+MODULE_PARM_DESC(max_persistent_grants,
+                 "Maximum number of grants to map persistently");
 
-struct xen_blkbk {
-	struct pending_req	*pending_reqs;
-	/* List of all 'pending_req' available */
-	struct list_head	pending_free;
-	/* And its spinlock. */
-	spinlock_t		pending_free_lock;
-	wait_queue_head_t	pending_free_wq;
-	/* The list of all pages that are available. */
-	struct page		**pending_pages;
-	/* And the grant handles that are available. */
-	grant_handle_t		*pending_grant_handles;
-};
-
-static struct xen_blkbk *blkbk;
+/*
+ * The LRU mechanism to clean the lists of persistent grants needs to
+ * be executed periodically. The time interval between consecutive executions
+ * of the purge mechanism is set in ms.
+ */
+#define LRU_INTERVAL 100
 
 /*
- * Maximum number of grant pages that can be mapped in blkback.
- * BLKIF_MAX_SEGMENTS_PER_REQUEST * RING_SIZE is the maximum number of
- * pages that blkback will persistently map.
- * Currently, this is:
- * RING_SIZE = 32 (for all known ring types)
- * BLKIF_MAX_SEGMENTS_PER_REQUEST = 11
- * sizeof(struct persistent_gnt) = 48
- * So the maximum memory used to store the grants is:
- * 32 * 11 * 48 = 16896 bytes
+ * When the persistent grants list is full we will remove unused grants
+ * from the list. The percentage of grants to be removed at each LRU
+ * execution.
  */
-static inline unsigned int max_mapped_grant_pages(enum blkif_protocol protocol)
+#define LRU_PERCENT_CLEAN 5
+
+/* Run-time switchable: /sys/module/blkback/parameters/ */
+static unsigned int log_stats;
+module_param(log_stats, int, 0644);
+
+#define BLKBACK_INVALID_HANDLE (~0)
+
+/* Number of free pages to remove on each call to free_xenballooned_pages */
+#define NUM_BATCH_FREE_PAGES 10
+
+static inline int get_free_page(struct xen_blkif *blkif, struct page **page)
 {
-	switch (protocol) {
-	case BLKIF_PROTOCOL_NATIVE:
-		return __CONST_RING_SIZE(blkif, PAGE_SIZE) *
-			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
-	case BLKIF_PROTOCOL_X86_32:
-		return __CONST_RING_SIZE(blkif_x86_32, PAGE_SIZE) *
-			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
-	case BLKIF_PROTOCOL_X86_64:
-		return __CONST_RING_SIZE(blkif_x86_64, PAGE_SIZE) *
-			   BLKIF_MAX_SEGMENTS_PER_REQUEST;
-	default:
-		BUG();
+	unsigned long flags;
+
+	spin_lock_irqsave(&blkif->free_pages_lock, flags);
+	if (list_empty(&blkif->free_pages)) {
+		BUG_ON(blkif->free_pages_num != 0);
+		spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+		return alloc_xenballooned_pages(1, page, false);
 	}
+	BUG_ON(blkif->free_pages_num == 0);
+	page[0] = list_first_entry(&blkif->free_pages, struct page, lru);
+	list_del(&page[0]->lru);
+	blkif->free_pages_num--;
+	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+
 	return 0;
 }
 
-
-/*
- * Little helpful macro to figure out the index and virtual address of the
- * pending_pages[..]. For each 'pending_req' we have have up to
- * BLKIF_MAX_SEGMENTS_PER_REQUEST (11) pages. The seg would be from 0 through
- * 10 and would index in the pending_pages[..].
- */
-static inline int vaddr_pagenr(struct pending_req *req, int seg)
+static inline void put_free_pages(struct xen_blkif *blkif, struct page **page,
+                                  int num)
 {
-	return (req - blkbk->pending_reqs) *
-		BLKIF_MAX_SEGMENTS_PER_REQUEST + seg;
-}
+	unsigned long flags;
+	int i;
 
-#define pending_page(req, seg) pending_pages[vaddr_pagenr(req, seg)]
+	spin_lock_irqsave(&blkif->free_pages_lock, flags);
+	for (i = 0; i < num; i++)
+		list_add(&page[i]->lru, &blkif->free_pages);
+	blkif->free_pages_num += num;
+	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+}
 
-static inline unsigned long vaddr(struct pending_req *req, int seg)
+static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num)
 {
-	unsigned long pfn = page_to_pfn(blkbk->pending_page(req, seg));
-	return (unsigned long)pfn_to_kaddr(pfn);
-}
+	/* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */
+	struct page *page[NUM_BATCH_FREE_PAGES];
+	unsigned int num_pages = 0;
+	unsigned long flags;
 
-#define pending_handle(_req, _seg) \
-	(blkbk->pending_grant_handles[vaddr_pagenr(_req, _seg)])
+	spin_lock_irqsave(&blkif->free_pages_lock, flags);
+	while (blkif->free_pages_num > num) {
+		BUG_ON(list_empty(&blkif->free_pages));
+		page[num_pages] = list_first_entry(&blkif->free_pages,
+		                                   struct page, lru);
+		list_del(&page[num_pages]->lru);
+		blkif->free_pages_num--;
+		if (++num_pages == NUM_BATCH_FREE_PAGES) {
+			spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+			free_xenballooned_pages(num_pages, page);
+			spin_lock_irqsave(&blkif->free_pages_lock, flags);
+			num_pages = 0;
+		}
+	}
+	spin_unlock_irqrestore(&blkif->free_pages_lock, flags);
+	if (num_pages != 0)
+		free_xenballooned_pages(num_pages, page);
+}
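shrink_free_pagepool() drains the pool in batches so the spinlock is not held across free_xenballooned_pages(). Below is a minimal user-space sketch of the same batching shape, with plain malloc/free standing in for ballooned pages and the locking elided:

/* Minimal sketch of the batched drain used by shrink_free_pagepool():
 * detach up to BATCH items (under the pool lock in the driver), free the
 * batch outside the critical section, and repeat until the pool is at the
 * requested size.  Only the batching shape is shown here.
 */
#include <stdio.h>
#include <stdlib.h>

#define BATCH 10

struct node { struct node *next; };

static struct node *pool;
static int pool_num;

static void drain_pool(int target)
{
    struct node *batch[BATCH];
    int n = 0;

    while (pool_num > target) {
        batch[n++] = pool;          /* detach the list head               */
        pool = pool->next;
        pool_num--;
        if (n == BATCH) {           /* driver drops the lock around this  */
            for (int i = 0; i < n; i++)
                free(batch[i]);
            n = 0;
        }
    }
    for (int i = 0; i < n; i++)     /* free the remainder                 */
        free(batch[i]);
}

int main(void)
{
    for (int i = 0; i < 25; i++) {
        struct node *nd = malloc(sizeof(*nd));
        nd->next = pool;
        pool = nd;
        pool_num++;
    }
    drain_pool(4);
    printf("pool now holds %d nodes\n", pool_num);
    return 0;
}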
 
+#define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page)))
 
 static int do_block_io_op(struct xen_blkif *blkif);
 static int dispatch_rw_block_io(struct xen_blkif *blkif,
@@ -170,13 +178,29 @@ static void make_response(struct xen_blkif *blkif, u64 id,
 	     (n) = (&(pos)->node != NULL) ? rb_next(&(pos)->node) : NULL)
 
 
-static void add_persistent_gnt(struct rb_root *root,
+/*
+ * We don't need locking around the persistent grant helpers
+ * because blkback uses a single thread for each backend, so we
+ * can be sure that these functions will never be called recursively.
+ *
+ * The only exception to that is put_persistent_gnt, which can be called
+ * from interrupt context (by xen_blkbk_unmap), so we have to use atomic
+ * bit operations to modify the flags of a persistent grant and to count
+ * the number of used grants.
+ */
+static int add_persistent_gnt(struct xen_blkif *blkif,
 			       struct persistent_gnt *persistent_gnt)
 {
-	struct rb_node **new = &(root->rb_node), *parent = NULL;
+	struct rb_node **new = NULL, *parent = NULL;
 	struct persistent_gnt *this;
 
+	if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) {
+		if (!blkif->vbd.overflow_max_grants)
+			blkif->vbd.overflow_max_grants = 1;
+		return -EBUSY;
+	}
 	/* Figure out where to put new node */
+	new = &blkif->persistent_gnts.rb_node;
 	while (*new) {
 		this = container_of(*new, struct persistent_gnt, node);
 
@@ -186,22 +210,28 @@ static void add_persistent_gnt(struct rb_root *root,
 		else if (persistent_gnt->gnt > this->gnt)
 			new = &((*new)->rb_right);
 		else {
-			pr_alert(DRV_PFX " trying to add a gref that's already in the tree\n");
-			BUG();
+			pr_alert_ratelimited(DRV_PFX " trying to add a gref that's already in the tree\n");
+			return -EINVAL;
 		}
 	}
 
+	bitmap_zero(persistent_gnt->flags, PERSISTENT_GNT_FLAGS_SIZE);
+	set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
 	/* Add new node and rebalance tree. */
 	rb_link_node(&(persistent_gnt->node), parent, new);
-	rb_insert_color(&(persistent_gnt->node), root);
+	rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts);
+	blkif->persistent_gnt_c++;
+	atomic_inc(&blkif->persistent_gnt_in_use);
+	return 0;
 }
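add_persistent_gnt() now enforces the max_persistent_grants cap and turns a duplicate grant reference into an error instead of a BUG(). A small stand-alone sketch of the same insert policy, using an unbalanced binary search tree in place of the kernel rbtree:

/* Sketch of the insert policy in add_persistent_gnt(): refuse inserts once
 * a cap is reached, walk down a tree keyed by grant reference, and reject
 * duplicates instead of crashing.  A plain BST stands in for the rbtree.
 */
#include <errno.h>
#include <stdio.h>
#include <stdlib.h>

struct gnt {
    unsigned int gref;
    struct gnt *left, *right;
};

static struct gnt *root;
static unsigned int count, cap = 4;

static int add_gnt(unsigned int gref)
{
    struct gnt **link = &root, *node;

    if (count >= cap)
        return -EBUSY;              /* list full: caller falls back to a
                                       non-persistent mapping */
    while (*link) {
        if (gref < (*link)->gref)
            link = &(*link)->left;
        else if (gref > (*link)->gref)
            link = &(*link)->right;
        else
            return -EINVAL;         /* duplicate grant reference */
    }
    node = calloc(1, sizeof(*node));
    if (!node)
        return -ENOMEM;
    node->gref = gref;
    *link = node;
    count++;
    return 0;
}

int main(void)
{
    unsigned int grefs[] = { 7, 3, 9, 3, 11, 2 };
    for (int i = 0; i < 6; i++)
        printf("add %u -> %d\n", grefs[i], add_gnt(grefs[i]));
    return 0;
}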
 
-static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
+static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif,
 						 grant_ref_t gref)
 {
 	struct persistent_gnt *data;
-	struct rb_node *node = root->rb_node;
+	struct rb_node *node = NULL;
 
+	node = blkif->persistent_gnts.rb_node;
 	while (node) {
 		data = container_of(node, struct persistent_gnt, node);
 
@@ -209,13 +239,31 @@ static struct persistent_gnt *get_persistent_gnt(struct rb_root *root,
 			node = node->rb_left;
 		else if (gref > data->gnt)
 			node = node->rb_right;
-		else
+		else {
+			if (test_bit(PERSISTENT_GNT_ACTIVE, data->flags)) {
+				pr_alert_ratelimited(DRV_PFX " requesting a grant already in use\n");
+				return NULL;
+			}
+			set_bit(PERSISTENT_GNT_ACTIVE, data->flags);
+			atomic_inc(&blkif->persistent_gnt_in_use);
 			return data;
+		}
 	}
 	return NULL;
 }
 
-static void free_persistent_gnts(struct rb_root *root, unsigned int num)
+static void put_persistent_gnt(struct xen_blkif *blkif,
+                               struct persistent_gnt *persistent_gnt)
+{
+	if (!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+		pr_alert_ratelimited(DRV_PFX " freeing a grant already unused");
+	set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+	clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags);
+	atomic_dec(&blkif->persistent_gnt_in_use);
+}
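The get/put pair above manages two flag bits per grant: ACTIVE while a request owns the grant, and WAS_ACTIVE to record that the grant was used since the last purge. A compact user-space sketch of that lifecycle (plain bit operations here; the driver needs atomic bitops because put_persistent_gnt may run from interrupt context):

/* Sketch of the ACTIVE / WAS_ACTIVE lifecycle kept per persistent grant.
 * get marks the grant busy; put marks it "used since the last purge" so
 * the LRU pass can prefer grants that were not touched recently.
 */
#include <stdbool.h>
#include <stdio.h>

#define GNT_ACTIVE      (1u << 0)
#define GNT_WAS_ACTIVE  (1u << 1)

struct gnt { unsigned int flags; };

static unsigned int in_use;

static bool get_gnt(struct gnt *g)
{
    if (g->flags & GNT_ACTIVE)
        return false;               /* already handed out to a request */
    g->flags |= GNT_ACTIVE;
    in_use++;
    return true;
}

static void put_gnt(struct gnt *g)
{
    g->flags |= GNT_WAS_ACTIVE;     /* remember it was used recently */
    g->flags &= ~GNT_ACTIVE;
    in_use--;
}

int main(void)
{
    struct gnt g = { 0 };

    get_gnt(&g);
    put_gnt(&g);
    printf("flags=%#x in_use=%u\n", g.flags, in_use);
    return 0;
}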
+
+static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root,
+                                 unsigned int num)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
@@ -240,7 +288,7 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
 			ret = gnttab_unmap_refs(unmap, NULL, pages,
 				segs_to_unmap);
 			BUG_ON(ret);
-			free_xenballooned_pages(segs_to_unmap, pages);
+			put_free_pages(blkif, pages, segs_to_unmap);
 			segs_to_unmap = 0;
 		}
 
@@ -251,21 +299,148 @@ static void free_persistent_gnts(struct rb_root *root, unsigned int num)
 	BUG_ON(num != 0);
 }
 
+static void unmap_purged_grants(struct work_struct *work)
+{
+	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct persistent_gnt *persistent_gnt;
+	int ret, segs_to_unmap = 0;
+	struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work);
+
+	while (!list_empty(&blkif->persistent_purge_list)) {
+		persistent_gnt = list_first_entry(&blkif->persistent_purge_list,
+		                                  struct persistent_gnt,
+		                                  remove_node);
+		list_del(&persistent_gnt->remove_node);
+
+		gnttab_set_unmap_op(&unmap[segs_to_unmap],
+			vaddr(persistent_gnt->page),
+			GNTMAP_host_map,
+			persistent_gnt->handle);
+
+		pages[segs_to_unmap] = persistent_gnt->page;
+
+		if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+			ret = gnttab_unmap_refs(unmap, NULL, pages,
+				segs_to_unmap);
+			BUG_ON(ret);
+			put_free_pages(blkif, pages, segs_to_unmap);
+			segs_to_unmap = 0;
+		}
+		kfree(persistent_gnt);
+	}
+	if (segs_to_unmap > 0) {
+		ret = gnttab_unmap_refs(unmap, NULL, pages, segs_to_unmap);
+		BUG_ON(ret);
+		put_free_pages(blkif, pages, segs_to_unmap);
+	}
+}
+
+static void purge_persistent_gnt(struct xen_blkif *blkif)
+{
+	struct persistent_gnt *persistent_gnt;
+	struct rb_node *n;
+	unsigned int num_clean, total;
+	bool scan_used = false, clean_used = false;
+	struct rb_root *root;
+
+	if (blkif->persistent_gnt_c < xen_blkif_max_pgrants ||
+	    (blkif->persistent_gnt_c == xen_blkif_max_pgrants &&
+	    !blkif->vbd.overflow_max_grants)) {
+		return;
+	}
+
+	if (work_pending(&blkif->persistent_purge_work)) {
+		pr_alert_ratelimited(DRV_PFX "Scheduled work from previous purge is still pending, cannot purge list\n");
+		return;
+	}
+
+	num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN;
+	num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean;
+	num_clean = min(blkif->persistent_gnt_c, num_clean);
+	if ((num_clean == 0) ||
+	    (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use))))
+		return;
+
+	/*
+	 * At this point, we can be sure that there will be no calls
+	 * to get_persistent_gnt (because we are executing this code from
+	 * xen_blkif_schedule); there can only be calls to put_persistent_gnt,
+	 * which means that the number of currently used grants will go down,
+	 * but never up, so we will always be able to remove the requested
+	 * number of grants.
+	 */
+
+	total = num_clean;
+
+	pr_debug(DRV_PFX "Going to purge %u persistent grants\n", num_clean);
+
+	INIT_LIST_HEAD(&blkif->persistent_purge_list);
+	root = &blkif->persistent_gnts;
+purge_list:
+	foreach_grant_safe(persistent_gnt, n, root, node) {
+		BUG_ON(persistent_gnt->handle ==
+			BLKBACK_INVALID_HANDLE);
+
+		if (clean_used) {
+			clear_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags);
+			continue;
+		}
+
+		if (test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags))
+			continue;
+		if (!scan_used &&
+		    (test_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags)))
+			continue;
+
+		rb_erase(&persistent_gnt->node, root);
+		list_add(&persistent_gnt->remove_node,
+		         &blkif->persistent_purge_list);
+		if (--num_clean == 0)
+			goto finished;
+	}
+	/*
+	 * If we get here it means we also need to start cleaning
+	 * grants that were used since the last purge in order to cope
+	 * with the requested number.
+	 */
+	if (!scan_used && !clean_used) {
+		pr_debug(DRV_PFX "Still missing %u purged frames\n", num_clean);
+		scan_used = true;
+		goto purge_list;
+	}
+finished:
+	if (!clean_used) {
+		pr_debug(DRV_PFX "Finished scanning for grants to clean, removing used flag\n");
+		clean_used = true;
+		goto purge_list;
+	}
+
+	blkif->persistent_gnt_c -= (total - num_clean);
+	blkif->vbd.overflow_max_grants = 0;
+
+	/* We can defer this work */
+	INIT_WORK(&blkif->persistent_purge_work, unmap_purged_grants);
+	schedule_work(&blkif->persistent_purge_work);
+	pr_debug(DRV_PFX "Purged %u/%u\n", (total - num_clean), total);
+	return;
+}
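Putting the sizing and the scan together: the purge wants roughly LRU_PERCENT_CLEAN percent of the cap plus whatever exceeds the cap, bounded by the number of grants that are currently idle, and it prefers grants that were not used since the last purge. A stand-alone sketch of that selection over a flat array (the counts below are illustrative):

/* Sketch of the purge sizing and two-pass victim selection in
 * purge_persistent_gnt().  ACTIVE grants are never touched; WAS_ACTIVE
 * grants are only taken on the second pass, if the first pass did not
 * find enough idle victims.
 */
#include <stdbool.h>
#include <stdio.h>

#define LRU_PERCENT_CLEAN 5

struct gnt { bool active, was_active, purged; };

static unsigned int pick_victims(struct gnt *g, unsigned int total,
                                 unsigned int num_clean)
{
    unsigned int picked = 0;

    for (int pass = 0; pass < 2 && picked < num_clean; pass++) {
        for (unsigned int i = 0; i < total && picked < num_clean; i++) {
            if (g[i].purged || g[i].active)
                continue;
            if (pass == 0 && g[i].was_active)
                continue;           /* keep recently used grants for now */
            g[i].purged = true;
            picked++;
        }
    }
    return picked;
}

int main(void)
{
    unsigned int cap = 100, mapped = 104, in_use = 10;
    struct gnt grants[104] = { 0 };

    for (unsigned int i = 0; i < in_use; i++)
        grants[i].active = true;
    for (unsigned int i = in_use; i < 40; i++)
        grants[i].was_active = true;

    /* 5% of the cap plus the overflow above the cap */
    unsigned int num_clean = (cap / 100) * LRU_PERCENT_CLEAN + (mapped - cap);
    if (num_clean > mapped - in_use)
        num_clean = 0;              /* too few idle grants: skip this run */

    printf("want %u, purged %u\n", num_clean,
           pick_victims(grants, mapped, num_clean));
    return 0;
}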
+
 /*
  * Retrieve from the 'pending_reqs' a free pending_req structure to be used.
  */
-static struct pending_req *alloc_req(void)
+static struct pending_req *alloc_req(struct xen_blkif *blkif)
 {
 	struct pending_req *req = NULL;
 	unsigned long flags;
 
-	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
-	if (!list_empty(&blkbk->pending_free)) {
-		req = list_entry(blkbk->pending_free.next, struct pending_req,
+	spin_lock_irqsave(&blkif->pending_free_lock, flags);
+	if (!list_empty(&blkif->pending_free)) {
+		req = list_entry(blkif->pending_free.next, struct pending_req,
 				 free_list);
 		list_del(&req->free_list);
 	}
-	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+	spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
 	return req;
 }
 
@@ -273,17 +448,17 @@ static struct pending_req *alloc_req(void)
  * Return the 'pending_req' structure back to the freepool. We also
  * wake up the thread if it was waiting for a free page.
  */
-static void free_req(struct pending_req *req)
+static void free_req(struct xen_blkif *blkif, struct pending_req *req)
 {
 	unsigned long flags;
 	int was_empty;
 
-	spin_lock_irqsave(&blkbk->pending_free_lock, flags);
-	was_empty = list_empty(&blkbk->pending_free);
-	list_add(&req->free_list, &blkbk->pending_free);
-	spin_unlock_irqrestore(&blkbk->pending_free_lock, flags);
+	spin_lock_irqsave(&blkif->pending_free_lock, flags);
+	was_empty = list_empty(&blkif->pending_free);
+	list_add(&req->free_list, &blkif->pending_free);
+	spin_unlock_irqrestore(&blkif->pending_free_lock, flags);
 	if (was_empty)
-		wake_up(&blkbk->pending_free_wq);
+		wake_up(&blkif->pending_free_wq);
 }
 
 /*
@@ -382,10 +557,12 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id)
 static void print_stats(struct xen_blkif *blkif)
 {
 	pr_info("xen-blkback (%s): oo %3llu  |  rd %4llu  |  wr %4llu  |  f %4llu"
-		 "  |  ds %4llu\n",
+		 "  |  ds %4llu | pg: %4u/%4d\n",
 		 current->comm, blkif->st_oo_req,
 		 blkif->st_rd_req, blkif->st_wr_req,
-		 blkif->st_f_req, blkif->st_ds_req);
+		 blkif->st_f_req, blkif->st_ds_req,
+		 blkif->persistent_gnt_c,
+		 xen_blkif_max_pgrants);
 	blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000);
 	blkif->st_rd_req = 0;
 	blkif->st_wr_req = 0;
@@ -397,6 +574,8 @@ int xen_blkif_schedule(void *arg)
 {
 	struct xen_blkif *blkif = arg;
 	struct xen_vbd *vbd = &blkif->vbd;
+	unsigned long timeout;
+	int ret;
 
 	xen_blkif_get(blkif);
 
@@ -406,27 +585,52 @@ int xen_blkif_schedule(void *arg)
 		if (unlikely(vbd->size != vbd_sz(vbd)))
 			xen_vbd_resize(blkif);
 
-		wait_event_interruptible(
+		timeout = msecs_to_jiffies(LRU_INTERVAL);
+
+		timeout = wait_event_interruptible_timeout(
 			blkif->wq,
-			blkif->waiting_reqs || kthread_should_stop());
-		wait_event_interruptible(
-			blkbk->pending_free_wq,
-			!list_empty(&blkbk->pending_free) ||
-			kthread_should_stop());
+			blkif->waiting_reqs || kthread_should_stop(),
+			timeout);
+		if (timeout == 0)
+			goto purge_gnt_list;
+		timeout = wait_event_interruptible_timeout(
+			blkif->pending_free_wq,
+			!list_empty(&blkif->pending_free) ||
+			kthread_should_stop(),
+			timeout);
+		if (timeout == 0)
+			goto purge_gnt_list;
 
 		blkif->waiting_reqs = 0;
 		smp_mb(); /* clear flag *before* checking for work */
 
-		if (do_block_io_op(blkif))
+		ret = do_block_io_op(blkif);
+		if (ret > 0)
 			blkif->waiting_reqs = 1;
+		if (ret == -EACCES)
+			wait_event_interruptible(blkif->shutdown_wq,
+						 kthread_should_stop());
+
+purge_gnt_list:
+		if (blkif->vbd.feature_gnt_persistent &&
+		    time_after(jiffies, blkif->next_lru)) {
+			purge_persistent_gnt(blkif);
+			blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL);
+		}
+
+		/* Shrink if we have more than xen_blkif_max_buffer_pages */
+		shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages);
 
 		if (log_stats && time_after(jiffies, blkif->st_print))
 			print_stats(blkif);
 	}
 
+	/* Since we are shutting down, remove all pages from the buffer */
+	shrink_free_pagepool(blkif, 0 /* All */);
+
 	/* Free all persistent grant pages */
 	if (!RB_EMPTY_ROOT(&blkif->persistent_gnts))
-		free_persistent_gnts(&blkif->persistent_gnts,
+		free_persistent_gnts(blkif, &blkif->persistent_gnts,
 			blkif->persistent_gnt_c);
 
 	BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts));
@@ -441,148 +645,98 @@ int xen_blkif_schedule(void *arg)
 	return 0;
 }
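The scheduling loop above now waits with a timeout so the purge can run roughly every LRU_INTERVAL milliseconds even when no requests arrive. A minimal user-space sketch of that deadline pattern, with nanosleep() standing in for the ring wait queues:

/* Sketch of the periodic-purge scheduling added to xen_blkif_schedule():
 * wake up at least every LRU_INTERVAL ms and run the purge whenever the
 * deadline (next_lru) has passed.  The real code waits on the ring's wait
 * queues; here the wait is just a sleep.
 */
#include <stdio.h>
#include <time.h>

#define LRU_INTERVAL_MS 100

static long now_ms(void)
{
    struct timespec ts;

    clock_gettime(CLOCK_MONOTONIC, &ts);
    return ts.tv_sec * 1000L + ts.tv_nsec / 1000000L;
}

int main(void)
{
    long next_lru = now_ms() + LRU_INTERVAL_MS;

    for (int iter = 0; iter < 5; iter++) {
        /* stand-in for wait_event_interruptible_timeout() */
        struct timespec delay = { 0, LRU_INTERVAL_MS * 1000000L };
        nanosleep(&delay, NULL);

        if (now_ms() - next_lru >= 0) {   /* time_after(jiffies, next_lru) */
            printf("iteration %d: purge persistent grants\n", iter);
            next_lru = now_ms() + LRU_INTERVAL_MS;
        }
    }
    return 0;
}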
 
-struct seg_buf {
-	unsigned int offset;
-	unsigned int nsec;
-};
 /*
  * Unmap the grant references, and also remove the M2P over-rides
  * used in the 'pending_req'.
  */
-static void xen_blkbk_unmap(struct pending_req *req)
+static void xen_blkbk_unmap(struct xen_blkif *blkif,
+                            struct grant_page *pages[],
+                            int num)
 {
 	struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct page *unmap_pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int i, invcount = 0;
-	grant_handle_t handle;
 	int ret;
 
-	for (i = 0; i < req->nr_pages; i++) {
-		if (!test_bit(i, req->unmap_seg))
+	for (i = 0; i < num; i++) {
+		if (pages[i]->persistent_gnt != NULL) {
+			put_persistent_gnt(blkif, pages[i]->persistent_gnt);
 			continue;
-		handle = pending_handle(req, i);
-		if (handle == BLKBACK_INVALID_HANDLE)
+		}
+		if (pages[i]->handle == BLKBACK_INVALID_HANDLE)
 			continue;
-		gnttab_set_unmap_op(&unmap[invcount], vaddr(req, i),
-				    GNTMAP_host_map, handle);
-		pending_handle(req, i) = BLKBACK_INVALID_HANDLE;
-		pages[invcount] = virt_to_page(vaddr(req, i));
-		invcount++;
+		unmap_pages[invcount] = pages[i]->page;
+		gnttab_set_unmap_op(&unmap[invcount], vaddr(pages[i]->page),
+				    GNTMAP_host_map, pages[i]->handle);
+		pages[i]->handle = BLKBACK_INVALID_HANDLE;
+		if (++invcount == BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+			ret = gnttab_unmap_refs(unmap, NULL, unmap_pages,
+			                        invcount);
+			BUG_ON(ret);
+			put_free_pages(blkif, unmap_pages, invcount);
+			invcount = 0;
+		}
+	}
+	if (invcount) {
+		ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount);
+		BUG_ON(ret);
+		put_free_pages(blkif, unmap_pages, invcount);
 	}
-
-	ret = gnttab_unmap_refs(unmap, NULL, pages, invcount);
-	BUG_ON(ret);
 }
 
-static int xen_blkbk_map(struct blkif_request *req,
-			 struct pending_req *pending_req,
-			 struct seg_buf seg[],
-			 struct page *pages[])
+static int xen_blkbk_map(struct xen_blkif *blkif,
+			 struct grant_page *pages[],
+			 int num, bool ro)
 {
 	struct gnttab_map_grant_ref map[BLKIF_MAX_SEGMENTS_PER_REQUEST];
-	struct persistent_gnt *persistent_gnts[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct page *pages_to_gnt[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	struct persistent_gnt *persistent_gnt = NULL;
-	struct xen_blkif *blkif = pending_req->blkif;
 	phys_addr_t addr = 0;
-	int i, j;
-	bool new_map;
-	int nseg = req->u.rw.nr_segments;
+	int i, seg_idx, new_map_idx;
 	int segs_to_map = 0;
 	int ret = 0;
+	int last_map = 0, map_until = 0;
 	int use_persistent_gnts;
 
 	use_persistent_gnts = (blkif->vbd.feature_gnt_persistent);
 
-	BUG_ON(blkif->persistent_gnt_c >
-		   max_mapped_grant_pages(pending_req->blkif->blk_protocol));
-
 	/*
 	 * Fill out preq.nr_sects with proper amount of sectors, and setup
 	 * assign map[..] with the PFN of the page in our domain with the
 	 * corresponding grant reference for each page.
 	 */
-	for (i = 0; i < nseg; i++) {
+again:
+	for (i = map_until; i < num; i++) {
 		uint32_t flags;
 
 		if (use_persistent_gnts)
 			persistent_gnt = get_persistent_gnt(
-				&blkif->persistent_gnts,
-				req->u.rw.seg[i].gref);
+				blkif,
+				pages[i]->gref);
 
 		if (persistent_gnt) {
 			/*
 			 * We are using persistent grants and
 			 * the grant is already mapped
 			 */
-			new_map = false;
-		} else if (use_persistent_gnts &&
-			   blkif->persistent_gnt_c <
-			   max_mapped_grant_pages(blkif->blk_protocol)) {
-			/*
-			 * We are using persistent grants, the grant is
-			 * not mapped but we have room for it
-			 */
-			new_map = true;
-			persistent_gnt = kmalloc(
-				sizeof(struct persistent_gnt),
-				GFP_KERNEL);
-			if (!persistent_gnt)
-				return -ENOMEM;
-			if (alloc_xenballooned_pages(1, &persistent_gnt->page,
-			    false)) {
-				kfree(persistent_gnt);
-				return -ENOMEM;
-			}
-			persistent_gnt->gnt = req->u.rw.seg[i].gref;
-			persistent_gnt->handle = BLKBACK_INVALID_HANDLE;
-
-			pages_to_gnt[segs_to_map] =
-				persistent_gnt->page;
-			addr = (unsigned long) pfn_to_kaddr(
-				page_to_pfn(persistent_gnt->page));
-
-			add_persistent_gnt(&blkif->persistent_gnts,
-				persistent_gnt);
-			blkif->persistent_gnt_c++;
-			pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
-				 persistent_gnt->gnt, blkif->persistent_gnt_c,
-				 max_mapped_grant_pages(blkif->blk_protocol));
+			pages[i]->page = persistent_gnt->page;
+			pages[i]->persistent_gnt = persistent_gnt;
 		} else {
-			/*
-			 * We are either using persistent grants and
-			 * hit the maximum limit of grants mapped,
-			 * or we are not using persistent grants.
-			 */
-			if (use_persistent_gnts &&
-				!blkif->vbd.overflow_max_grants) {
-				blkif->vbd.overflow_max_grants = 1;
-				pr_alert(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
-					 blkif->domid, blkif->vbd.handle);
-			}
-			new_map = true;
-			pages[i] = blkbk->pending_page(pending_req, i);
-			addr = vaddr(pending_req, i);
-			pages_to_gnt[segs_to_map] =
-				blkbk->pending_page(pending_req, i);
-		}
-
-		if (persistent_gnt) {
-			pages[i] = persistent_gnt->page;
-			persistent_gnts[i] = persistent_gnt;
-		} else {
-			persistent_gnts[i] = NULL;
-		}
-
-		if (new_map) {
+			if (get_free_page(blkif, &pages[i]->page))
+				goto out_of_memory;
+			addr = vaddr(pages[i]->page);
+			pages_to_gnt[segs_to_map] = pages[i]->page;
+			pages[i]->persistent_gnt = NULL;
 			flags = GNTMAP_host_map;
-			if (!persistent_gnt &&
-			    (pending_req->operation != BLKIF_OP_READ))
+			if (!use_persistent_gnts && ro)
 				flags |= GNTMAP_readonly;
 			gnttab_set_map_op(&map[segs_to_map++], addr,
-					  flags, req->u.rw.seg[i].gref,
+					  flags, pages[i]->gref,
 					  blkif->domid);
 		}
+		map_until = i + 1;
+		if (segs_to_map == BLKIF_MAX_SEGMENTS_PER_REQUEST)
+			break;
 	}
 
 	if (segs_to_map) {
@@ -595,49 +749,133 @@ static int xen_blkbk_map(struct blkif_request *req,
 	 * so that when we access vaddr(pending_req,i) it has the contents of
 	 * the page from the other domain.
 	 */
-	bitmap_zero(pending_req->unmap_seg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	for (i = 0, j = 0; i < nseg; i++) {
-		if (!persistent_gnts[i] ||
-		    persistent_gnts[i]->handle == BLKBACK_INVALID_HANDLE) {
+	for (seg_idx = last_map, new_map_idx = 0; seg_idx < map_until; seg_idx++) {
+		if (!pages[seg_idx]->persistent_gnt) {
 			/* This is a newly mapped grant */
-			BUG_ON(j >= segs_to_map);
-			if (unlikely(map[j].status != 0)) {
+			BUG_ON(new_map_idx >= segs_to_map);
+			if (unlikely(map[new_map_idx].status != 0)) {
 				pr_debug(DRV_PFX "invalid buffer -- could not remap it\n");
-				map[j].handle = BLKBACK_INVALID_HANDLE;
+				pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE;
 				ret |= 1;
-				if (persistent_gnts[i]) {
-					rb_erase(&persistent_gnts[i]->node,
-						 &blkif->persistent_gnts);
-					blkif->persistent_gnt_c--;
-					kfree(persistent_gnts[i]);
-					persistent_gnts[i] = NULL;
-				}
+				goto next;
 			}
+			pages[seg_idx]->handle = map[new_map_idx].handle;
+		} else {
+			continue;
 		}
-		if (persistent_gnts[i]) {
-			if (persistent_gnts[i]->handle ==
-			    BLKBACK_INVALID_HANDLE) {
+		if (use_persistent_gnts &&
+		    blkif->persistent_gnt_c < xen_blkif_max_pgrants) {
+			/*
+			 * We are using persistent grants, the grant is
+			 * not mapped but we might have room for it.
+			 */
+			persistent_gnt = kmalloc(sizeof(struct persistent_gnt),
+				                 GFP_KERNEL);
+			if (!persistent_gnt) {
 				/*
-				 * If this is a new persistent grant
-				 * save the handler
+				 * If we don't have enough memory to
+				 * allocate the persistent_gnt struct,
+				 * map this grant non-persistently.
 				 */
-				persistent_gnts[i]->handle = map[j++].handle;
+				goto next;
 			}
-			pending_handle(pending_req, i) =
-				persistent_gnts[i]->handle;
+			persistent_gnt->gnt = map[new_map_idx].ref;
+			persistent_gnt->handle = map[new_map_idx].handle;
+			persistent_gnt->page = pages[seg_idx]->page;
+			if (add_persistent_gnt(blkif,
+			                       persistent_gnt)) {
+				kfree(persistent_gnt);
+				persistent_gnt = NULL;
+				goto next;
+			}
+			pages[seg_idx]->persistent_gnt = persistent_gnt;
+			pr_debug(DRV_PFX " grant %u added to the tree of persistent grants, using %u/%u\n",
+				 persistent_gnt->gnt, blkif->persistent_gnt_c,
+				 xen_blkif_max_pgrants);
+			goto next;
+		}
+		if (use_persistent_gnts && !blkif->vbd.overflow_max_grants) {
+			blkif->vbd.overflow_max_grants = 1;
+			pr_debug(DRV_PFX " domain %u, device %#x is using maximum number of persistent grants\n",
+			         blkif->domid, blkif->vbd.handle);
+		}
+		/*
+		 * We could not map this grant persistently, so use it as
+		 * a non-persistent grant.
+		 */
+next:
+		new_map_idx++;
+	}
+	segs_to_map = 0;
+	last_map = map_until;
+	if (map_until != num)
+		goto again;
 
-			if (ret)
-				continue;
-		} else {
-			pending_handle(pending_req, i) = map[j++].handle;
-			bitmap_set(pending_req->unmap_seg, i, 1);
+	return ret;
+
+out_of_memory:
+	pr_alert(DRV_PFX "%s: out of memory\n", __func__);
+	put_free_pages(blkif, pages_to_gnt, segs_to_map);
+	return -ENOMEM;
+}
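Because an indirect request can carry far more segments than one grant-map batch, xen_blkbk_map() now walks the segment array in windows of at most BLKIF_MAX_SEGMENTS_PER_REQUEST entries: map a window, fix up its handles, then continue. A small sketch of just that windowing (the batch size of 11 matches the ring ABI; the segment count of 30 is arbitrary):

/* Sketch of the windowed processing in xen_blkbk_map(): each window of the
 * segment array is mapped with one batch, then post-processed, before the
 * loop advances to the next window.
 */
#include <stdio.h>

#define BATCH 11   /* BLKIF_MAX_SEGMENTS_PER_REQUEST */

int main(void)
{
    int num = 30;                   /* segments in this (indirect) request */
    int last_map = 0, map_until = 0;

    while (map_until < num) {
        int segs_to_map = 0;

        for (int i = map_until; i < num; i++) {
            segs_to_map++;          /* queue grant i for the map batch */
            map_until = i + 1;
            if (segs_to_map == BATCH)
                break;
        }
        printf("map batch of %d, then fix up segments [%d, %d)\n",
               segs_to_map, last_map, map_until);
        last_map = map_until;
    }
    return 0;
}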
+
+static int xen_blkbk_map_seg(struct pending_req *pending_req)
+{
+	int rc;
+
+	rc = xen_blkbk_map(pending_req->blkif, pending_req->segments,
+			   pending_req->nr_pages,
+	                   (pending_req->operation != BLKIF_OP_READ));
+
+	return rc;
+}
 
-			if (ret)
-				continue;
+static int xen_blkbk_parse_indirect(struct blkif_request *req,
+				    struct pending_req *pending_req,
+				    struct seg_buf seg[],
+				    struct phys_req *preq)
+{
+	struct grant_page **pages = pending_req->indirect_pages;
+	struct xen_blkif *blkif = pending_req->blkif;
+	int indirect_grefs, rc, n, nseg, i;
+	struct blkif_request_segment_aligned *segments = NULL;
+
+	nseg = pending_req->nr_pages;
+	indirect_grefs = INDIRECT_PAGES(nseg);
+	BUG_ON(indirect_grefs > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
+
+	for (i = 0; i < indirect_grefs; i++)
+		pages[i]->gref = req->u.indirect.indirect_grefs[i];
+
+	rc = xen_blkbk_map(blkif, pages, indirect_grefs, true);
+	if (rc)
+		goto unmap;
+
+	for (n = 0, i = 0; n < nseg; n++) {
+		if ((n % SEGS_PER_INDIRECT_FRAME) == 0) {
+			/* Map indirect segments */
+			if (segments)
+				kunmap_atomic(segments);
+			segments = kmap_atomic(pages[n/SEGS_PER_INDIRECT_FRAME]->page);
+		}
+		i = n % SEGS_PER_INDIRECT_FRAME;
+		pending_req->segments[n]->gref = segments[i].gref;
+		seg[n].nsec = segments[i].last_sect -
+			segments[i].first_sect + 1;
+		seg[n].offset = (segments[i].first_sect << 9);
+		if ((segments[i].last_sect >= (PAGE_SIZE >> 9)) ||
+		    (segments[i].last_sect < segments[i].first_sect)) {
+			rc = -EINVAL;
+			goto unmap;
 		}
-		seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+		preq->nr_sects += seg[n].nsec;
 	}
-	return ret;
+
+unmap:
+	if (segments)
+		kunmap_atomic(segments);
+	xen_blkbk_unmap(blkif, pages, indirect_grefs);
+	return rc;
 }
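Each indirect page is an array of segment descriptors, so segment n is found in indirect page n / SEGS_PER_INDIRECT_FRAME at slot n % SEGS_PER_INDIRECT_FRAME, and its first_sect/last_sect pair is converted into a byte offset and a sector count and then range-checked. A stand-alone sketch of that per-segment math (assuming 4 KiB pages and an 8-byte descriptor):

/* Sketch of the per-segment math in xen_blkbk_parse_indirect(), assuming
 * 4 KiB pages and an 8-byte segment descriptor, so 512 descriptors fit in
 * one indirect page.  first_sect/last_sect are 512-byte sector indexes
 * within the granted data page.
 */
#include <stdio.h>

#define PAGE_SIZE 4096
#define SEGS_PER_INDIRECT_FRAME (PAGE_SIZE / 8)

struct seg_desc { unsigned char first_sect, last_sect; };

int main(void)
{
    struct seg_desc d = { .first_sect = 2, .last_sect = 7 };
    int n = 600;                                /* segment index in request */

    int page  = n / SEGS_PER_INDIRECT_FRAME;    /* which indirect page      */
    int index = n % SEGS_PER_INDIRECT_FRAME;    /* slot within that page    */

    if (d.last_sect >= PAGE_SIZE / 512 || d.last_sect < d.first_sect) {
        fprintf(stderr, "invalid segment\n");
        return 1;
    }
    unsigned int nsec   = d.last_sect - d.first_sect + 1;  /* sector count  */
    unsigned int offset = d.first_sect << 9;               /* byte offset   */

    printf("segment %d: indirect page %d slot %d, %u sectors at offset %u\n",
           n, page, index, nsec, offset);
    return 0;
}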
 
 static int dispatch_discard_io(struct xen_blkif *blkif,
@@ -647,7 +885,18 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
 	int status = BLKIF_RSP_OKAY;
 	struct block_device *bdev = blkif->vbd.bdev;
 	unsigned long secure;
+	struct phys_req preq;
+
+	preq.sector_number = req->u.discard.sector_number;
+	preq.nr_sects      = req->u.discard.nr_sectors;
 
+	err = xen_vbd_translate(&preq, blkif, WRITE);
+	if (err) {
+		pr_warn(DRV_PFX "access denied: DISCARD [%llu->%llu] on dev=%04x\n",
+			preq.sector_number,
+			preq.sector_number + preq.nr_sects, blkif->vbd.pdevice);
+		goto fail_response;
+	}
 	blkif->st_ds_req++;
 
 	xen_blkif_get(blkif);
@@ -658,7 +907,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif,
 	err = blkdev_issue_discard(bdev, req->u.discard.sector_number,
 				   req->u.discard.nr_sectors,
 				   GFP_KERNEL, secure);
-
+fail_response:
 	if (err == -EOPNOTSUPP) {
 		pr_debug(DRV_PFX "discard op failed, not supported\n");
 		status = BLKIF_RSP_EOPNOTSUPP;
@@ -674,7 +923,7 @@ static int dispatch_other_io(struct xen_blkif *blkif,
 			     struct blkif_request *req,
 			     struct pending_req *pending_req)
 {
-	free_req(pending_req);
+	free_req(blkif, pending_req);
 	make_response(blkif, req->u.other.id, req->operation,
 		      BLKIF_RSP_EOPNOTSUPP);
 	return -EIO;
@@ -726,7 +975,9 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
 	 * the proper response on the ring.
 	 */
 	if (atomic_dec_and_test(&pending_req->pendcnt)) {
-		xen_blkbk_unmap(pending_req);
+		xen_blkbk_unmap(pending_req->blkif,
+		                pending_req->segments,
+		                pending_req->nr_pages);
 		make_response(pending_req->blkif, pending_req->id,
 			      pending_req->operation, pending_req->status);
 		xen_blkif_put(pending_req->blkif);
@@ -734,7 +985,7 @@ static void __end_block_io_op(struct pending_req *pending_req, int error)
 			if (atomic_read(&pending_req->blkif->drain))
 				complete(&pending_req->blkif->drain_complete);
 		}
-		free_req(pending_req);
+		free_req(pending_req->blkif, pending_req);
 	}
 }
 
@@ -767,6 +1018,12 @@ __do_block_io_op(struct xen_blkif *blkif)
 	rp = blk_rings->common.sring->req_prod;
 	rmb(); /* Ensure we see queued requests up to 'rp'. */
 
+	if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) {
+		rc = blk_rings->common.rsp_prod_pvt;
+		pr_warn(DRV_PFX "Frontend provided bogus ring requests (%d - %d = %d). Halting ring processing on dev=%04x\n",
+			rp, rc, rp - rc, blkif->vbd.pdevice);
+		return -EACCES;
+	}
 	while (rc != rp) {
 
 		if (RING_REQUEST_CONS_OVERFLOW(&blk_rings->common, rc))
@@ -777,7 +1034,7 @@ __do_block_io_op(struct xen_blkif *blkif)
 			break;
 		}
 
-		pending_req = alloc_req();
+		pending_req = alloc_req(blkif);
 		if (NULL == pending_req) {
 			blkif->st_oo_req++;
 			more_to_do = 1;
@@ -807,11 +1064,12 @@ __do_block_io_op(struct xen_blkif *blkif)
 		case BLKIF_OP_WRITE:
 		case BLKIF_OP_WRITE_BARRIER:
 		case BLKIF_OP_FLUSH_DISKCACHE:
+		case BLKIF_OP_INDIRECT:
 			if (dispatch_rw_block_io(blkif, &req, pending_req))
 				goto done;
 			break;
 		case BLKIF_OP_DISCARD:
-			free_req(pending_req);
+			free_req(blkif, pending_req);
 			if (dispatch_discard_io(blkif, &req))
 				goto done;
 			break;
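The -EACCES path added above in __do_block_io_op() rejects a ring whose request producer has run away from the backend's private response producer. A tiny sketch of that sanity check on free-running unsigned ring indexes (a RING_SIZE of 32 is assumed here for illustration):

/* Sketch of the bogus-producer check: with free-running (wrapping) ring
 * indexes, a sane frontend can never have more than RING_SIZE requests
 * outstanding, so req_prod - rsp_prod_pvt must not exceed the ring size.
 */
#include <stdio.h>

#define RING_SIZE 32u

static int ring_overflow(unsigned int req_prod, unsigned int rsp_prod_pvt)
{
    /* unsigned subtraction handles index wrap-around */
    return (req_prod - rsp_prod_pvt) > RING_SIZE;
}

int main(void)
{
    printf("%d\n", ring_overflow(40, 10));          /* 30 outstanding: ok  */
    printf("%d\n", ring_overflow(100, 10));         /* 90 outstanding: bad */
    printf("%d\n", ring_overflow(5, 0xfffffff0u));  /* wrapped, 21: ok     */
    return 0;
}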
@@ -853,17 +1111,28 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 				struct pending_req *pending_req)
 {
 	struct phys_req preq;
-	struct seg_buf seg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct seg_buf *seg = pending_req->seg;
 	unsigned int nseg;
 	struct bio *bio = NULL;
-	struct bio *biolist[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct bio **biolist = pending_req->biolist;
 	int i, nbio = 0;
 	int operation;
 	struct blk_plug plug;
 	bool drain = false;
-	struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct grant_page **pages = pending_req->segments;
+	unsigned short req_operation;
+
+	req_operation = req->operation == BLKIF_OP_INDIRECT ?
+			req->u.indirect.indirect_op : req->operation;
+	if ((req->operation == BLKIF_OP_INDIRECT) &&
+	    (req_operation != BLKIF_OP_READ) &&
+	    (req_operation != BLKIF_OP_WRITE)) {
+		pr_debug(DRV_PFX "Invalid indirect operation (%u)\n",
+			 req_operation);
+		goto fail_response;
+	}
 
-	switch (req->operation) {
+	switch (req_operation) {
 	case BLKIF_OP_READ:
 		blkif->st_rd_req++;
 		operation = READ;
@@ -885,33 +1154,47 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	}
 
 	/* Check that the number of segments is sane. */
-	nseg = req->u.rw.nr_segments;
+	nseg = req->operation == BLKIF_OP_INDIRECT ?
+	       req->u.indirect.nr_segments : req->u.rw.nr_segments;
 
 	if (unlikely(nseg == 0 && operation != WRITE_FLUSH) ||
-	    unlikely(nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
+	    unlikely((req->operation != BLKIF_OP_INDIRECT) &&
+		     (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST)) ||
+	    unlikely((req->operation == BLKIF_OP_INDIRECT) &&
+		     (nseg > MAX_INDIRECT_SEGMENTS))) {
 		pr_debug(DRV_PFX "Bad number of segments in request (%d)\n",
 			 nseg);
 		/* Haven't submitted any bio's yet. */
 		goto fail_response;
 	}
 
-	preq.sector_number = req->u.rw.sector_number;
 	preq.nr_sects      = 0;
 
 	pending_req->blkif     = blkif;
 	pending_req->id        = req->u.rw.id;
-	pending_req->operation = req->operation;
+	pending_req->operation = req_operation;
 	pending_req->status    = BLKIF_RSP_OKAY;
 	pending_req->nr_pages  = nseg;
 
-	for (i = 0; i < nseg; i++) {
-		seg[i].nsec = req->u.rw.seg[i].last_sect -
-			req->u.rw.seg[i].first_sect + 1;
-		if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
-		    (req->u.rw.seg[i].last_sect < req->u.rw.seg[i].first_sect))
+	if (req->operation != BLKIF_OP_INDIRECT) {
+		preq.dev               = req->u.rw.handle;
+		preq.sector_number     = req->u.rw.sector_number;
+		for (i = 0; i < nseg; i++) {
+			pages[i]->gref = req->u.rw.seg[i].gref;
+			seg[i].nsec = req->u.rw.seg[i].last_sect -
+				req->u.rw.seg[i].first_sect + 1;
+			seg[i].offset = (req->u.rw.seg[i].first_sect << 9);
+			if ((req->u.rw.seg[i].last_sect >= (PAGE_SIZE >> 9)) ||
+			    (req->u.rw.seg[i].last_sect <
+			     req->u.rw.seg[i].first_sect))
+				goto fail_response;
+			preq.nr_sects += seg[i].nsec;
+		}
+	} else {
+		preq.dev               = req->u.indirect.handle;
+		preq.sector_number     = req->u.indirect.sector_number;
+		if (xen_blkbk_parse_indirect(req, pending_req, seg, &preq))
 			goto fail_response;
-		preq.nr_sects += seg[i].nsec;
-
 	}
 
 	if (xen_vbd_translate(&preq, blkif, operation) != 0) {
@@ -948,7 +1231,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	 * the hypercall to unmap the grants - that is all done in
 	 * xen_blkbk_unmap.
 	 */
-	if (xen_blkbk_map(req, pending_req, seg, pages))
+	if (xen_blkbk_map_seg(pending_req))
 		goto fail_flush;
 
 	/*
@@ -960,11 +1243,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	for (i = 0; i < nseg; i++) {
 		while ((bio == NULL) ||
 		       (bio_add_page(bio,
-				     pages[i],
+				     pages[i]->page,
 				     seg[i].nsec << 9,
 				     seg[i].offset) == 0)) {
 
-			bio = bio_alloc(GFP_KERNEL, nseg-i);
+			int nr_iovecs = min_t(int, (nseg-i), BIO_MAX_PAGES);
+			bio = bio_alloc(GFP_KERNEL, nr_iovecs);
 			if (unlikely(bio == NULL))
 				goto fail_put_bio;
 
@@ -1009,11 +1293,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif,
 	return 0;
 
  fail_flush:
-	xen_blkbk_unmap(pending_req);
+	xen_blkbk_unmap(blkif, pending_req->segments,
+	                pending_req->nr_pages);
  fail_response:
 	/* Haven't submitted any bio's yet. */
-	make_response(blkif, req->u.rw.id, req->operation, BLKIF_RSP_ERROR);
-	free_req(pending_req);
+	make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR);
+	free_req(blkif, pending_req);
 	msleep(1); /* back off a bit */
 	return -EIO;
 
@@ -1070,73 +1355,20 @@ static void make_response(struct xen_blkif *blkif, u64 id,
 
 static int __init xen_blkif_init(void)
 {
-	int i, mmap_pages;
 	int rc = 0;
 
 	if (!xen_domain())
 		return -ENODEV;
 
-	blkbk = kzalloc(sizeof(struct xen_blkbk), GFP_KERNEL);
-	if (!blkbk) {
-		pr_alert(DRV_PFX "%s: out of memory!\n", __func__);
-		return -ENOMEM;
-	}
-
-	mmap_pages = xen_blkif_reqs * BLKIF_MAX_SEGMENTS_PER_REQUEST;
-
-	blkbk->pending_reqs          = kzalloc(sizeof(blkbk->pending_reqs[0]) *
-					xen_blkif_reqs, GFP_KERNEL);
-	blkbk->pending_grant_handles = kmalloc(sizeof(blkbk->pending_grant_handles[0]) *
-					mmap_pages, GFP_KERNEL);
-	blkbk->pending_pages         = kzalloc(sizeof(blkbk->pending_pages[0]) *
-					mmap_pages, GFP_KERNEL);
-
-	if (!blkbk->pending_reqs || !blkbk->pending_grant_handles ||
-	    !blkbk->pending_pages) {
-		rc = -ENOMEM;
-		goto out_of_memory;
-	}
-
-	for (i = 0; i < mmap_pages; i++) {
-		blkbk->pending_grant_handles[i] = BLKBACK_INVALID_HANDLE;
-		blkbk->pending_pages[i] = alloc_page(GFP_KERNEL);
-		if (blkbk->pending_pages[i] == NULL) {
-			rc = -ENOMEM;
-			goto out_of_memory;
-		}
-	}
 	rc = xen_blkif_interface_init();
 	if (rc)
 		goto failed_init;
 
-	INIT_LIST_HEAD(&blkbk->pending_free);
-	spin_lock_init(&blkbk->pending_free_lock);
-	init_waitqueue_head(&blkbk->pending_free_wq);
-
-	for (i = 0; i < xen_blkif_reqs; i++)
-		list_add_tail(&blkbk->pending_reqs[i].free_list,
-			      &blkbk->pending_free);
-
 	rc = xen_blkif_xenbus_init();
 	if (rc)
 		goto failed_init;
 
-	return 0;
-
- out_of_memory:
-	pr_alert(DRV_PFX "%s: out of memory\n", __func__);
  failed_init:
-	kfree(blkbk->pending_reqs);
-	kfree(blkbk->pending_grant_handles);
-	if (blkbk->pending_pages) {
-		for (i = 0; i < mmap_pages; i++) {
-			if (blkbk->pending_pages[i])
-				__free_page(blkbk->pending_pages[i]);
-		}
-		kfree(blkbk->pending_pages);
-	}
-	kfree(blkbk);
-	blkbk = NULL;
 	return rc;
 }
 
diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h
index 60103e2517ba..8d8807563d99 100644
--- a/drivers/block/xen-blkback/common.h
+++ b/drivers/block/xen-blkback/common.h
@@ -50,6 +50,19 @@
 		 __func__, __LINE__, ##args)
 
 
+/*
+ * This is the maximum number of segments that would be allowed in indirect
+ * requests. This value will also be passed to the frontend.
+ */
+#define MAX_INDIRECT_SEGMENTS 256
+
+#define SEGS_PER_INDIRECT_FRAME \
+	(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+#define MAX_INDIRECT_PAGES \
+	((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+#define INDIRECT_PAGES(_segs) \
+	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
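These macros are plain ceiling divisions. A quick stand-alone check of what they evaluate to (assuming 4 KiB pages and an 8-byte blkif_request_segment_aligned descriptor, which gives 512 descriptors per indirect page):

/* Sketch evaluating the indirect-page macros above, assuming 4 KiB pages
 * and an 8-byte segment descriptor (an assumption for illustration).
 */
#include <stdio.h>

#define PAGE_SIZE 4096
#define SEG_DESC_SIZE 8                       /* assumed descriptor size */
#define MAX_INDIRECT_SEGMENTS 256
#define SEGS_PER_INDIRECT_FRAME (PAGE_SIZE / SEG_DESC_SIZE)
#define INDIRECT_PAGES(segs) \
    (((segs) + SEGS_PER_INDIRECT_FRAME - 1) / SEGS_PER_INDIRECT_FRAME)

int main(void)
{
    printf("%d segment descriptors per indirect page\n",
           SEGS_PER_INDIRECT_FRAME);
    printf("%d segments -> %d indirect page(s)\n",
           MAX_INDIRECT_SEGMENTS, INDIRECT_PAGES(MAX_INDIRECT_SEGMENTS));
    printf("%d segments -> %d indirect page(s)\n",
           513, INDIRECT_PAGES(513));
    return 0;
}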
+
 /* Not a real protocol.  Used to generate ring structs which contain
  * the elements common to all protocols only.  This way we get a
  * compiler-checkable way to use common struct elements, so we can
@@ -83,12 +96,31 @@ struct blkif_x86_32_request_other {
 	uint64_t       id;           /* private guest value, echoed in resp  */
 } __attribute__((__packed__));
 
+struct blkif_x86_32_request_indirect {
+	uint8_t        indirect_op;
+	uint16_t       nr_segments;
+	uint64_t       id;
+	blkif_sector_t sector_number;
+	blkif_vdev_t   handle;
+	uint16_t       _pad1;
+	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+	/*
+	 * The maximum number of indirect segments (and pages) that will
+	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+	 * is also exported to the guest (via xenstore
+	 * feature-max-indirect-segments entry), so the frontend knows how
+	 * many indirect segments the backend supports.
+	 */
+	uint64_t       _pad2;        /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
 struct blkif_x86_32_request {
 	uint8_t        operation;    /* BLKIF_OP_???                         */
 	union {
 		struct blkif_x86_32_request_rw rw;
 		struct blkif_x86_32_request_discard discard;
 		struct blkif_x86_32_request_other other;
+		struct blkif_x86_32_request_indirect indirect;
 	} u;
 } __attribute__((__packed__));
 
@@ -127,12 +159,32 @@ struct blkif_x86_64_request_other {
 	uint64_t       id;           /* private guest value, echoed in resp  */
 } __attribute__((__packed__));
 
+struct blkif_x86_64_request_indirect {
+	uint8_t        indirect_op;
+	uint16_t       nr_segments;
+	uint32_t       _pad1;        /* offsetof(blkif_..,u.indirect.id)==8   */
+	uint64_t       id;
+	blkif_sector_t sector_number;
+	blkif_vdev_t   handle;
+	uint16_t       _pad2;
+	grant_ref_t    indirect_grefs[BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST];
+	/*
+	 * The maximum number of indirect segments (and pages) that will
+	 * be used is determined by MAX_INDIRECT_SEGMENTS, this value
+	 * is also exported to the guest (via xenstore
+	 * feature-max-indirect-segments entry), so the frontend knows how
+	 * many indirect segments the backend supports.
+	 */
+	uint32_t       _pad3;        /* make it 64 byte aligned */
+} __attribute__((__packed__));
+
 struct blkif_x86_64_request {
 	uint8_t        operation;    /* BLKIF_OP_???                         */
 	union {
 		struct blkif_x86_64_request_rw rw;
 		struct blkif_x86_64_request_discard discard;
 		struct blkif_x86_64_request_other other;
+		struct blkif_x86_64_request_indirect indirect;
 	} u;
 } __attribute__((__packed__));
 
@@ -182,12 +234,26 @@ struct xen_vbd {
 
 struct backend_info;
 
+/* Number of available flags */
+#define PERSISTENT_GNT_FLAGS_SIZE	2
+/* This persistent grant is currently in use */
+#define PERSISTENT_GNT_ACTIVE		0
+/*
+ * This persistent grant has been used; the flag is set when we clear
+ * PERSISTENT_GNT_ACTIVE, so we know that this grant was used recently.
+ */
+#define PERSISTENT_GNT_WAS_ACTIVE	1
+
+/* Number of requests that we can fit in a ring */
+#define XEN_BLKIF_REQS			32
 
 struct persistent_gnt {
 	struct page *page;
 	grant_ref_t gnt;
 	grant_handle_t handle;
+	DECLARE_BITMAP(flags, PERSISTENT_GNT_FLAGS_SIZE);
 	struct rb_node node;
+	struct list_head remove_node;
 };
 
 struct xen_blkif {
@@ -219,6 +285,23 @@ struct xen_blkif {
 	/* tree to store persistent grants */
 	struct rb_root		persistent_gnts;
 	unsigned int		persistent_gnt_c;
+	atomic_t		persistent_gnt_in_use;
+	unsigned long           next_lru;
+
+	/* used by the kworker that offloads work from the persistent purge */
+	struct list_head	persistent_purge_list;
+	struct work_struct	persistent_purge_work;
+
+	/* buffer of free pages to map grant refs */
+	spinlock_t		free_pages_lock;
+	int			free_pages_num;
+	struct list_head	free_pages;
+
+	/* List of all 'pending_req' available */
+	struct list_head	pending_free;
+	/* And its spinlock. */
+	spinlock_t		pending_free_lock;
+	wait_queue_head_t	pending_free_wq;
 
 	/* statistics */
 	unsigned long		st_print;
@@ -231,6 +314,41 @@ struct xen_blkif {
 	unsigned long long			st_wr_sect;
 
 	wait_queue_head_t	waiting_to_free;
+	/* Thread shutdown wait queue. */
+	wait_queue_head_t	shutdown_wq;
+};
+
+struct seg_buf {
+	unsigned long offset;
+	unsigned int nsec;
+};
+
+struct grant_page {
+	struct page 		*page;
+	struct persistent_gnt	*persistent_gnt;
+	grant_handle_t		handle;
+	grant_ref_t		gref;
+};
+
+/*
+ * Each outstanding request that we've passed to the lower device layers has a
+ * 'pending_req' allocated to it. Each buffer_head that completes decrements
+ * the pendcnt towards zero. When it hits zero, the specified domain has a
+ * response queued for it, with the saved 'id' passed back.
+ */
+struct pending_req {
+	struct xen_blkif	*blkif;
+	u64			id;
+	int			nr_pages;
+	atomic_t		pendcnt;
+	unsigned short		operation;
+	int			status;
+	struct list_head	free_list;
+	struct grant_page	*segments[MAX_INDIRECT_SEGMENTS];
+	/* Indirect descriptors */
+	struct grant_page	*indirect_pages[MAX_INDIRECT_PAGES];
+	struct seg_buf		seg[MAX_INDIRECT_SEGMENTS];
+	struct bio		*biolist[MAX_INDIRECT_SEGMENTS];
 };
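Since each backend now preallocates its own pool of XEN_BLKIF_REQS pending requests sized for MAX_INDIRECT_SEGMENTS, a quick count of the grant_page objects set up in xen_blkif_alloc() (the SEGS_PER_INDIRECT_FRAME value of 512 is an assumption based on an 8-byte descriptor):

/* Rough object-count sketch for the per-backend pools: every backend
 * preallocates XEN_BLKIF_REQS pending requests, each with one grant_page
 * per possible segment and per possible indirect page.  Only object
 * counts are shown, not struct sizes.
 */
#include <stdio.h>

#define XEN_BLKIF_REQS 32
#define MAX_INDIRECT_SEGMENTS 256
#define SEGS_PER_INDIRECT_FRAME 512          /* assumed: 4 KiB / 8 bytes */
#define MAX_INDIRECT_PAGES \
    ((MAX_INDIRECT_SEGMENTS + SEGS_PER_INDIRECT_FRAME - 1) / SEGS_PER_INDIRECT_FRAME)

int main(void)
{
    int per_req = MAX_INDIRECT_SEGMENTS + MAX_INDIRECT_PAGES;

    printf("grant_page objects per request: %d\n", per_req);
    printf("grant_page objects per backend: %d\n", per_req * XEN_BLKIF_REQS);
    return 0;
}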
 
 
@@ -257,6 +375,7 @@ int xen_blkif_xenbus_init(void);
 
 irqreturn_t xen_blkif_be_int(int irq, void *dev_id);
 int xen_blkif_schedule(void *arg);
+int xen_blkif_purge_persistent(void *arg);
 
 int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt,
 			      struct backend_info *be, int state);
@@ -268,7 +387,7 @@ struct xenbus_device *xen_blkbk_xenbus(struct backend_info *be);
 static inline void blkif_get_x86_32_req(struct blkif_request *dst,
 					struct blkif_x86_32_request *src)
 {
-	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
 	dst->operation = src->operation;
 	switch (src->operation) {
 	case BLKIF_OP_READ:
@@ -291,6 +410,18 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
 		dst->u.discard.sector_number = src->u.discard.sector_number;
 		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
 		break;
+	case BLKIF_OP_INDIRECT:
+		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+		dst->u.indirect.handle = src->u.indirect.handle;
+		dst->u.indirect.id = src->u.indirect.id;
+		dst->u.indirect.sector_number = src->u.indirect.sector_number;
+		barrier();
+		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+		for (i = 0; i < j; i++)
+			dst->u.indirect.indirect_grefs[i] =
+				src->u.indirect.indirect_grefs[i];
+		break;
 	default:
 		/*
 		 * Don't know how to translate this op. Only get the
@@ -304,7 +435,7 @@ static inline void blkif_get_x86_32_req(struct blkif_request *dst,
 static inline void blkif_get_x86_64_req(struct blkif_request *dst,
 					struct blkif_x86_64_request *src)
 {
-	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	int i, n = BLKIF_MAX_SEGMENTS_PER_REQUEST, j;
 	dst->operation = src->operation;
 	switch (src->operation) {
 	case BLKIF_OP_READ:
@@ -327,6 +458,18 @@ static inline void blkif_get_x86_64_req(struct blkif_request *dst,
 		dst->u.discard.sector_number = src->u.discard.sector_number;
 		dst->u.discard.nr_sectors = src->u.discard.nr_sectors;
 		break;
+	case BLKIF_OP_INDIRECT:
+		dst->u.indirect.indirect_op = src->u.indirect.indirect_op;
+		dst->u.indirect.nr_segments = src->u.indirect.nr_segments;
+		dst->u.indirect.handle = src->u.indirect.handle;
+		dst->u.indirect.id = src->u.indirect.id;
+		dst->u.indirect.sector_number = src->u.indirect.sector_number;
+		barrier();
+		j = min(MAX_INDIRECT_PAGES, INDIRECT_PAGES(dst->u.indirect.nr_segments));
+		for (i = 0; i < j; i++)
+			dst->u.indirect.indirect_grefs[i] =
+				src->u.indirect.indirect_grefs[i];
+		break;
 	default:
 		/*
 		 * Don't know how to translate this op. Only get the
diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c
index 04608a6502d7..fe5c3cd10c34 100644
--- a/drivers/block/xen-blkback/xenbus.c
+++ b/drivers/block/xen-blkback/xenbus.c
@@ -98,12 +98,17 @@ static void xen_update_blkif_status(struct xen_blkif *blkif)
 		err = PTR_ERR(blkif->xenblkd);
 		blkif->xenblkd = NULL;
 		xenbus_dev_error(blkif->be->dev, err, "start xenblkd");
+		return;
 	}
 }
 
 static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 {
 	struct xen_blkif *blkif;
+	struct pending_req *req, *n;
+	int i, j;
+
+	BUILD_BUG_ON(MAX_INDIRECT_PAGES > BLKIF_MAX_INDIRECT_PAGES_PER_REQUEST);
 
 	blkif = kmem_cache_zalloc(xen_blkif_cachep, GFP_KERNEL);
 	if (!blkif)
@@ -118,8 +123,57 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid)
 	blkif->st_print = jiffies;
 	init_waitqueue_head(&blkif->waiting_to_free);
 	blkif->persistent_gnts.rb_node = NULL;
+	spin_lock_init(&blkif->free_pages_lock);
+	INIT_LIST_HEAD(&blkif->free_pages);
+	blkif->free_pages_num = 0;
+	atomic_set(&blkif->persistent_gnt_in_use, 0);
+
+	INIT_LIST_HEAD(&blkif->pending_free);
+
+	for (i = 0; i < XEN_BLKIF_REQS; i++) {
+		req = kzalloc(sizeof(*req), GFP_KERNEL);
+		if (!req)
+			goto fail;
+		list_add_tail(&req->free_list,
+		              &blkif->pending_free);
+		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+			req->segments[j] = kzalloc(sizeof(*req->segments[0]),
+			                           GFP_KERNEL);
+			if (!req->segments[j])
+				goto fail;
+		}
+		for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+			req->indirect_pages[j] = kzalloc(sizeof(*req->indirect_pages[0]),
+			                                 GFP_KERNEL);
+			if (!req->indirect_pages[j])
+				goto fail;
+		}
+	}
+	spin_lock_init(&blkif->pending_free_lock);
+	init_waitqueue_head(&blkif->pending_free_wq);
+	init_waitqueue_head(&blkif->shutdown_wq);
 
 	return blkif;
+
+fail:
+	list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+		list_del(&req->free_list);
+		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) {
+			if (!req->segments[j])
+				break;
+			kfree(req->segments[j]);
+		}
+		for (j = 0; j < MAX_INDIRECT_PAGES; j++) {
+			if (!req->indirect_pages[j])
+				break;
+			kfree(req->indirect_pages[j]);
+		}
+		kfree(req);
+	}
+
+	kmem_cache_free(xen_blkif_cachep, blkif);
+
+	return ERR_PTR(-ENOMEM);
 }
 
 static int xen_blkif_map(struct xen_blkif *blkif, unsigned long shared_page,
@@ -178,6 +232,7 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
 {
 	if (blkif->xenblkd) {
 		kthread_stop(blkif->xenblkd);
+		wake_up(&blkif->shutdown_wq);
 		blkif->xenblkd = NULL;
 	}
 
@@ -198,8 +253,28 @@ static void xen_blkif_disconnect(struct xen_blkif *blkif)
 
 static void xen_blkif_free(struct xen_blkif *blkif)
 {
+	struct pending_req *req, *n;
+	int i = 0, j;
+
 	if (!atomic_dec_and_test(&blkif->refcnt))
 		BUG();
+
+	/* Check that there is no request in use */
+	list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) {
+		list_del(&req->free_list);
+
+		for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++)
+			kfree(req->segments[j]);
+
+		for (j = 0; j < MAX_INDIRECT_PAGES; j++)
+			kfree(req->indirect_pages[j]);
+
+		kfree(req);
+		i++;
+	}
+
+	WARN_ON(i != XEN_BLKIF_REQS);
+
 	kmem_cache_free(xen_blkif_cachep, blkif);
 }
 
@@ -678,6 +753,11 @@ again:
 				 dev->nodename);
 		goto abort;
 	}
+	err = xenbus_printf(xbt, dev->nodename, "feature-max-indirect-segments", "%u",
+			    MAX_INDIRECT_SEGMENTS);
+	if (err)
+		dev_warn(&dev->dev, "writing %s/feature-max-indirect-segments (%d)",
+			 dev->nodename, err);
 
 	err = xenbus_printf(xbt, dev->nodename, "sectors", "%llu",
 			    (unsigned long long)vbd_sz(&be->blkif->vbd));
@@ -704,6 +784,11 @@ again:
 				 dev->nodename);
 		goto abort;
 	}
+	err = xenbus_printf(xbt, dev->nodename, "physical-sector-size", "%u",
+			    bdev_physical_block_size(be->blkif->vbd.bdev));
+	if (err)
+		xenbus_dev_error(dev, err, "writing %s/physical-sector-size",
+				 dev->nodename);
 
 	err = xenbus_transaction_end(xbt, 0);
 	if (err == -EAGAIN)
diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c
index d89ef86220f4..a4660bbee8a6 100644
--- a/drivers/block/xen-blkfront.c
+++ b/drivers/block/xen-blkfront.c
@@ -74,12 +74,30 @@ struct grant {
 struct blk_shadow {
 	struct blkif_request req;
 	struct request *request;
-	struct grant *grants_used[BLKIF_MAX_SEGMENTS_PER_REQUEST];
+	struct grant **grants_used;
+	struct grant **indirect_grants;
+	struct scatterlist *sg;
+};
+
+struct split_bio {
+	struct bio *bio;
+	atomic_t pending;
+	int err;
 };
 
 static DEFINE_MUTEX(blkfront_mutex);
 static const struct block_device_operations xlvbd_block_fops;
 
+/*
+ * Maximum number of segments in indirect requests; the actual value used by
+ * the frontend driver is the minimum of this value and the value provided
+ * by the backend driver.
+ */
+
+static unsigned int xen_blkif_max_segments = 32;
+module_param_named(max, xen_blkif_max_segments, int, S_IRUGO);
+MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)");
+
 #define BLK_RING_SIZE __CONST_RING_SIZE(blkif, PAGE_SIZE)
 
 /*
@@ -98,7 +116,6 @@ struct blkfront_info
 	enum blkif_state connected;
 	int ring_ref;
 	struct blkif_front_ring ring;
-	struct scatterlist sg[BLKIF_MAX_SEGMENTS_PER_REQUEST];
 	unsigned int evtchn, irq;
 	struct request_queue *rq;
 	struct work_struct work;
@@ -114,6 +131,7 @@ struct blkfront_info
 	unsigned int discard_granularity;
 	unsigned int discard_alignment;
 	unsigned int feature_persistent:1;
+	unsigned int max_indirect_segments;
 	int is_ready;
 };
 
@@ -142,6 +160,13 @@ static DEFINE_SPINLOCK(minor_lock);
 
 #define DEV_NAME	"xvd"	/* name in /dev */
 
+#define SEGS_PER_INDIRECT_FRAME \
+	(PAGE_SIZE/sizeof(struct blkif_request_segment_aligned))
+#define INDIRECT_GREFS(_segs) \
+	((_segs + SEGS_PER_INDIRECT_FRAME - 1)/SEGS_PER_INDIRECT_FRAME)
+
+static int blkfront_setup_indirect(struct blkfront_info *info);
+
 static int get_id_from_freelist(struct blkfront_info *info)
 {
 	unsigned long free = info->shadow_free;
@@ -358,7 +383,8 @@ static int blkif_queue_request(struct request *req)
 	struct blkif_request *ring_req;
 	unsigned long id;
 	unsigned int fsect, lsect;
-	int i, ref;
+	int i, ref, n;
+	struct blkif_request_segment_aligned *segments = NULL;
 
 	/*
 	 * Used to store if we are able to queue the request by just using
@@ -369,21 +395,27 @@ static int blkif_queue_request(struct request *req)
 	grant_ref_t gref_head;
 	struct grant *gnt_list_entry = NULL;
 	struct scatterlist *sg;
+	int nseg, max_grefs;
 
 	if (unlikely(info->connected != BLKIF_STATE_CONNECTED))
 		return 1;
 
-	/* Check if we have enought grants to allocate a requests */
-	if (info->persistent_gnts_c < BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+	max_grefs = info->max_indirect_segments ?
+		    info->max_indirect_segments +
+		    INDIRECT_GREFS(info->max_indirect_segments) :
+		    BLKIF_MAX_SEGMENTS_PER_REQUEST;
+
+	/* Check if we have enough grants to allocate a request */
+	if (info->persistent_gnts_c < max_grefs) {
 		new_persistent_gnts = 1;
 		if (gnttab_alloc_grant_references(
-		    BLKIF_MAX_SEGMENTS_PER_REQUEST - info->persistent_gnts_c,
+		    max_grefs - info->persistent_gnts_c,
 		    &gref_head) < 0) {
 			gnttab_request_free_callback(
 				&info->callback,
 				blkif_restart_queue_callback,
 				info,
-				BLKIF_MAX_SEGMENTS_PER_REQUEST);
+				max_grefs);
 			return 1;
 		}
 	} else
@@ -394,42 +426,67 @@ static int blkif_queue_request(struct request *req)
 	id = get_id_from_freelist(info);
 	info->shadow[id].request = req;
 
-	ring_req->u.rw.id = id;
-	ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
-	ring_req->u.rw.handle = info->handle;
-
-	ring_req->operation = rq_data_dir(req) ?
-		BLKIF_OP_WRITE : BLKIF_OP_READ;
-
-	if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
-		/*
-		 * Ideally we can do an unordered flush-to-disk. In case the
-		 * backend onlysupports barriers, use that. A barrier request
-		 * a superset of FUA, so we can implement it the same
-		 * way.  (It's also a FLUSH+FUA, since it is
-		 * guaranteed ordered WRT previous writes.)
-		 */
-		ring_req->operation = info->flush_op;
-	}
-
 	if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) {
-		/* id, sector_number and handle are set above. */
 		ring_req->operation = BLKIF_OP_DISCARD;
 		ring_req->u.discard.nr_sectors = blk_rq_sectors(req);
+		ring_req->u.discard.id = id;
+		ring_req->u.discard.sector_number = (blkif_sector_t)blk_rq_pos(req);
 		if ((req->cmd_flags & REQ_SECURE) && info->feature_secdiscard)
 			ring_req->u.discard.flag = BLKIF_DISCARD_SECURE;
 		else
 			ring_req->u.discard.flag = 0;
 	} else {
-		ring_req->u.rw.nr_segments = blk_rq_map_sg(req->q, req,
-							   info->sg);
-		BUG_ON(ring_req->u.rw.nr_segments >
-		       BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-		for_each_sg(info->sg, sg, ring_req->u.rw.nr_segments, i) {
+		BUG_ON(info->max_indirect_segments == 0 &&
+		       req->nr_phys_segments > BLKIF_MAX_SEGMENTS_PER_REQUEST);
+		BUG_ON(info->max_indirect_segments &&
+		       req->nr_phys_segments > info->max_indirect_segments);
+		nseg = blk_rq_map_sg(req->q, req, info->shadow[id].sg);
+		ring_req->u.rw.id = id;
+		if (nseg > BLKIF_MAX_SEGMENTS_PER_REQUEST) {
+			/*
+			 * The indirect operation can only be a BLKIF_OP_READ or
+			 * BLKIF_OP_WRITE
+			 */
+			BUG_ON(req->cmd_flags & (REQ_FLUSH | REQ_FUA));
+			ring_req->operation = BLKIF_OP_INDIRECT;
+			ring_req->u.indirect.indirect_op = rq_data_dir(req) ?
+				BLKIF_OP_WRITE : BLKIF_OP_READ;
+			ring_req->u.indirect.sector_number = (blkif_sector_t)blk_rq_pos(req);
+			ring_req->u.indirect.handle = info->handle;
+			ring_req->u.indirect.nr_segments = nseg;
+		} else {
+			ring_req->u.rw.sector_number = (blkif_sector_t)blk_rq_pos(req);
+			ring_req->u.rw.handle = info->handle;
+			ring_req->operation = rq_data_dir(req) ?
+				BLKIF_OP_WRITE : BLKIF_OP_READ;
+			if (req->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
+				/*
+				 * Ideally we can do an unordered flush-to-disk. In case the
+				 * backend only supports barriers, use that. A barrier request
+				 * is a superset of FUA, so we can implement it the same
+				 * way.  (It's also a FLUSH+FUA, since it is
+				 * guaranteed ordered WRT previous writes.)
+				 */
+				ring_req->operation = info->flush_op;
+			}
+			ring_req->u.rw.nr_segments = nseg;
+		}
+		for_each_sg(info->shadow[id].sg, sg, nseg, i) {
 			fsect = sg->offset >> 9;
 			lsect = fsect + (sg->length >> 9) - 1;
 
+			if ((ring_req->operation == BLKIF_OP_INDIRECT) &&
+			    (i % SEGS_PER_INDIRECT_FRAME == 0)) {
+				if (segments)
+					kunmap_atomic(segments);
+
+				n = i / SEGS_PER_INDIRECT_FRAME;
+				gnt_list_entry = get_grant(&gref_head, info);
+				info->shadow[id].indirect_grants[n] = gnt_list_entry;
+				segments = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
+				ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref;
+			}
+
 			gnt_list_entry = get_grant(&gref_head, info);
 			ref = gnt_list_entry->gref;
 
@@ -441,8 +498,7 @@ static int blkif_queue_request(struct request *req)
 
 				BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 
-				shared_data = kmap_atomic(
-					pfn_to_page(gnt_list_entry->pfn));
+				shared_data = kmap_atomic(pfn_to_page(gnt_list_entry->pfn));
 				bvec_data = kmap_atomic(sg_page(sg));
 
 				/*
@@ -461,13 +517,23 @@ static int blkif_queue_request(struct request *req)
 				kunmap_atomic(bvec_data);
 				kunmap_atomic(shared_data);
 			}
-
-			ring_req->u.rw.seg[i] =
-					(struct blkif_request_segment) {
-						.gref       = ref,
-						.first_sect = fsect,
-						.last_sect  = lsect };
+			if (ring_req->operation != BLKIF_OP_INDIRECT) {
+				ring_req->u.rw.seg[i] =
+						(struct blkif_request_segment) {
+							.gref       = ref,
+							.first_sect = fsect,
+							.last_sect  = lsect };
+			} else {
+				n = i % SEGS_PER_INDIRECT_FRAME;
+				segments[n] =
+					(struct blkif_request_segment_aligned) {
+							.gref       = ref,
+							.first_sect = fsect,
+							.last_sect  = lsect };
+			}
 		}
+		if (segments)
+			kunmap_atomic(segments);
 	}
 
 	info->ring.req_prod_pvt++;
@@ -542,7 +608,9 @@ wait:
 		flush_requests(info);
 }
 
-static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
+static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size,
+				unsigned int physical_sector_size,
+				unsigned int segments)
 {
 	struct request_queue *rq;
 	struct blkfront_info *info = gd->private_data;
@@ -564,14 +632,15 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 
 	/* Hard sector size and max sectors impersonate the equiv. hardware. */
 	blk_queue_logical_block_size(rq, sector_size);
-	blk_queue_max_hw_sectors(rq, 512);
+	blk_queue_physical_block_size(rq, physical_sector_size);
+	blk_queue_max_hw_sectors(rq, (segments * PAGE_SIZE) / 512);
 
 	/* Each segment in a request is up to an aligned page in size. */
 	blk_queue_segment_boundary(rq, PAGE_SIZE - 1);
 	blk_queue_max_segment_size(rq, PAGE_SIZE);
 
 	/* Ensure a merged request will fit in a single I/O ring slot. */
-	blk_queue_max_segments(rq, BLKIF_MAX_SEGMENTS_PER_REQUEST);
+	blk_queue_max_segments(rq, segments);
 
 	/* Make sure buffer addresses are sector-aligned. */
 	blk_queue_dma_alignment(rq, 511);
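
With the change above the maximum transfer size tracks the negotiated segment count: (segments * PAGE_SIZE) / 512 sectors. As a rough worked example (assuming 4 KiB pages): the non-indirect maximum of BLKIF_MAX_SEGMENTS_PER_REQUEST (11) allows 88 sectors, i.e. 44 KiB per request, while a backend advertising 256 indirect segments allows 2048 sectors, i.e. 1 MiB per request.
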
@@ -588,13 +657,16 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size)
 static void xlvbd_flush(struct blkfront_info *info)
 {
 	blk_queue_flush(info->rq, info->feature_flush);
-	printk(KERN_INFO "blkfront: %s: %s: %s %s\n",
+	printk(KERN_INFO "blkfront: %s: %s: %s %s %s %s %s\n",
 	       info->gd->disk_name,
 	       info->flush_op == BLKIF_OP_WRITE_BARRIER ?
 		"barrier" : (info->flush_op == BLKIF_OP_FLUSH_DISKCACHE ?
 		"flush diskcache" : "barrier or flush"),
-	       info->feature_flush ? "enabled" : "disabled",
-	       info->feature_persistent ? "using persistent grants" : "");
+	       info->feature_flush ? "enabled;" : "disabled;",
+	       "persistent grants:",
+	       info->feature_persistent ? "enabled;" : "disabled;",
+	       "indirect descriptors:",
+	       info->max_indirect_segments ? "enabled;" : "disabled;");
 }
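
For example, on a device where flush, persistent grants and indirect descriptors are all available, the expanded message would read roughly as follows (disk name hypothetical):

	blkfront: xvda: flush diskcache: enabled; persistent grants: enabled; indirect descriptors: enabled;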
 
 static int xen_translate_vdev(int vdevice, int *minor, unsigned int *offset)
@@ -667,7 +739,8 @@ static char *encode_disk_name(char *ptr, unsigned int n)
 
 static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 			       struct blkfront_info *info,
-			       u16 vdisk_info, u16 sector_size)
+			       u16 vdisk_info, u16 sector_size,
+			       unsigned int physical_sector_size)
 {
 	struct gendisk *gd;
 	int nr_minors = 1;
@@ -734,7 +807,9 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity,
 	gd->driverfs_dev = &(info->xbdev->dev);
 	set_capacity(gd, capacity);
 
-	if (xlvbd_init_blk_queue(gd, sector_size)) {
+	if (xlvbd_init_blk_queue(gd, sector_size, physical_sector_size,
+				 info->max_indirect_segments ? :
+				 BLKIF_MAX_SEGMENTS_PER_REQUEST)) {
 		del_gendisk(gd);
 		goto release;
 	}
@@ -818,6 +893,7 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 {
 	struct grant *persistent_gnt;
 	struct grant *n;
+	int i, j, segs;
 
 	/* Prevent new requests being issued until we fix things up. */
 	spin_lock_irq(&info->io_lock);
@@ -843,6 +919,47 @@ static void blkif_free(struct blkfront_info *info, int suspend)
 	}
 	BUG_ON(info->persistent_gnts_c != 0);
 
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		/*
+		 * Clear persistent grants present in requests already
+		 * on the shared ring
+		 */
+		if (!info->shadow[i].request)
+			goto free_shadow;
+
+		segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ?
+		       info->shadow[i].req.u.indirect.nr_segments :
+		       info->shadow[i].req.u.rw.nr_segments;
+		for (j = 0; j < segs; j++) {
+			persistent_gnt = info->shadow[i].grants_used[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			__free_page(pfn_to_page(persistent_gnt->pfn));
+			kfree(persistent_gnt);
+		}
+
+		if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT)
+			/*
+			 * If this is not an indirect operation, don't try to
+			 * free indirect segments.
+			 */
+			goto free_shadow;
+
+		for (j = 0; j < INDIRECT_GREFS(segs); j++) {
+			persistent_gnt = info->shadow[i].indirect_grants[j];
+			gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL);
+			__free_page(pfn_to_page(persistent_gnt->pfn));
+			kfree(persistent_gnt);
+		}
+
+free_shadow:
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+	}
+
 	/* No more gnttab callback work. */
 	gnttab_cancel_free_callback(&info->callback);
 	spin_unlock_irq(&info->io_lock);
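
The cleanup above walks the per-request shadow state, whose arrays are now allocated dynamically by blkfront_setup_indirect(). A rough sketch of the struct blk_shadow fields this loop assumes (the actual definition is updated earlier in the file, outside this hunk):

	struct blk_shadow {
		struct blkif_request req;	/* copy of the ring request */
		struct request *request;	/* originating block-layer request */
		struct grant **grants_used;	/* one grant per data segment */
		struct grant **indirect_grants;	/* pages holding indirect segments */
		struct scatterlist *sg;		/* scatterlist for this request */
	};
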
@@ -867,12 +984,13 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 			     struct blkif_response *bret)
 {
 	int i = 0;
-	struct bio_vec *bvec;
-	struct req_iterator iter;
-	unsigned long flags;
+	struct scatterlist *sg;
 	char *bvec_data;
 	void *shared_data;
-	unsigned int offset = 0;
+	int nseg;
+
+	nseg = s->req.operation == BLKIF_OP_INDIRECT ?
+		s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments;
 
 	if (bret->operation == BLKIF_OP_READ) {
 		/*
@@ -881,26 +999,29 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info,
 		 * than PAGE_SIZE, we have to keep track of the current offset,
 		 * to be sure we are copying the data from the right shared page.
 		 */
-		rq_for_each_segment(bvec, s->request, iter) {
-			BUG_ON((bvec->bv_offset + bvec->bv_len) > PAGE_SIZE);
-			if (bvec->bv_offset < offset)
-				i++;
-			BUG_ON(i >= s->req.u.rw.nr_segments);
+		for_each_sg(s->sg, sg, nseg, i) {
+			BUG_ON(sg->offset + sg->length > PAGE_SIZE);
 			shared_data = kmap_atomic(
 				pfn_to_page(s->grants_used[i]->pfn));
-			bvec_data = bvec_kmap_irq(bvec, &flags);
-			memcpy(bvec_data, shared_data + bvec->bv_offset,
-				bvec->bv_len);
-			bvec_kunmap_irq(bvec_data, &flags);
+			bvec_data = kmap_atomic(sg_page(sg));
+			memcpy(bvec_data   + sg->offset,
+			       shared_data + sg->offset,
+			       sg->length);
+			kunmap_atomic(bvec_data);
 			kunmap_atomic(shared_data);
-			offset = bvec->bv_offset + bvec->bv_len;
 		}
 	}
 	/* Add the persistent grant into the list of free grants */
-	for (i = 0; i < s->req.u.rw.nr_segments; i++) {
+	for (i = 0; i < nseg; i++) {
 		list_add(&s->grants_used[i]->node, &info->persistent_gnts);
 		info->persistent_gnts_c++;
 	}
+	if (s->req.operation == BLKIF_OP_INDIRECT) {
+		for (i = 0; i < INDIRECT_GREFS(nseg); i++) {
+			list_add(&s->indirect_grants[i]->node, &info->persistent_gnts);
+			info->persistent_gnts_c++;
+		}
+	}
 }
 
 static irqreturn_t blkif_interrupt(int irq, void *dev_id)
@@ -1034,14 +1155,6 @@ static int setup_blkring(struct xenbus_device *dev,
 	SHARED_RING_INIT(sring);
 	FRONT_RING_INIT(&info->ring, sring, PAGE_SIZE);
 
-	sg_init_table(info->sg, BLKIF_MAX_SEGMENTS_PER_REQUEST);
-
-	/* Allocate memory for grants */
-	err = fill_grant_buffer(info, BLK_RING_SIZE *
-	                              BLKIF_MAX_SEGMENTS_PER_REQUEST);
-	if (err)
-		goto fail;
-
 	err = xenbus_grant_ring(dev, virt_to_mfn(info->ring.sring));
 	if (err < 0) {
 		free_page((unsigned long)sring);
@@ -1223,13 +1336,84 @@ static int blkfront_probe(struct xenbus_device *dev,
 	return 0;
 }
 
+/*
+ * This is a clone of md_trim_bio, used to split a bio into smaller ones
+ */
+static void trim_bio(struct bio *bio, int offset, int size)
+{
+	/* 'bio' is a cloned bio which we need to trim to match
+	 * the given offset and size.
+	 * This requires adjusting bi_sector, bi_size, and bi_io_vec
+	 */
+	int i;
+	struct bio_vec *bvec;
+	int sofar = 0;
+
+	size <<= 9;
+	if (offset == 0 && size == bio->bi_size)
+		return;
+
+	bio->bi_sector += offset;
+	bio->bi_size = size;
+	offset <<= 9;
+	clear_bit(BIO_SEG_VALID, &bio->bi_flags);
+
+	while (bio->bi_idx < bio->bi_vcnt &&
+	       bio->bi_io_vec[bio->bi_idx].bv_len <= offset) {
+		/* remove this whole bio_vec */
+		offset -= bio->bi_io_vec[bio->bi_idx].bv_len;
+		bio->bi_idx++;
+	}
+	if (bio->bi_idx < bio->bi_vcnt) {
+		bio->bi_io_vec[bio->bi_idx].bv_offset += offset;
+		bio->bi_io_vec[bio->bi_idx].bv_len -= offset;
+	}
+	/* avoid any complications with bi_idx being non-zero */
+	if (bio->bi_idx) {
+		memmove(bio->bi_io_vec, bio->bi_io_vec+bio->bi_idx,
+			(bio->bi_vcnt - bio->bi_idx) * sizeof(struct bio_vec));
+		bio->bi_vcnt -= bio->bi_idx;
+		bio->bi_idx = 0;
+	}
+	/* Make sure vcnt and last bv are not too big */
+	bio_for_each_segment(bvec, bio, i) {
+		if (sofar + bvec->bv_len > size)
+			bvec->bv_len = size - sofar;
+		if (bvec->bv_len == 0) {
+			bio->bi_vcnt = i;
+			break;
+		}
+		sofar += bvec->bv_len;
+	}
+}
+
+static void split_bio_end(struct bio *bio, int error)
+{
+	struct split_bio *split_bio = bio->bi_private;
+
+	if (error)
+		split_bio->err = error;
+
+	if (atomic_dec_and_test(&split_bio->pending)) {
+		split_bio->bio->bi_phys_segments = 0;
+		bio_endio(split_bio->bio, split_bio->err);
+		kfree(split_bio);
+	}
+	bio_put(bio);
+}
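
split_bio_end() relies on the small bookkeeping structure introduced earlier in this patch; roughly:

	struct split_bio {
		struct bio *bio;	/* the original, oversized bio */
		atomic_t pending;	/* cloned bios still in flight */
		int err;		/* first error reported by any clone */
	};

All clones point at the shared split_bio through bi_private, and the parent bio is completed only once the pending count drops to zero.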
 
 static int blkif_recover(struct blkfront_info *info)
 {
 	int i;
-	struct blkif_request *req;
+	struct request *req, *n;
 	struct blk_shadow *copy;
-	int j;
+	int rc;
+	struct bio *bio, *cloned_bio;
+	struct bio_list bio_list, merge_bio;
+	unsigned int segs, offset;
+	int pending, size;
+	struct split_bio *split_bio;
+	struct list_head requests;
 
 	/* Stage 1: Make a safe copy of the shadow state. */
 	copy = kmemdup(info->shadow, sizeof(info->shadow),
@@ -1244,36 +1428,64 @@ static int blkif_recover(struct blkfront_info *info)
 	info->shadow_free = info->ring.req_prod_pvt;
 	info->shadow[BLK_RING_SIZE-1].req.u.rw.id = 0x0fffffff;
 
-	/* Stage 3: Find pending requests and requeue them. */
+	rc = blkfront_setup_indirect(info);
+	if (rc) {
+		kfree(copy);
+		return rc;
+	}
+
+	segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	blk_queue_max_segments(info->rq, segs);
+	bio_list_init(&bio_list);
+	INIT_LIST_HEAD(&requests);
 	for (i = 0; i < BLK_RING_SIZE; i++) {
 		/* Not in use? */
 		if (!copy[i].request)
 			continue;
 
-		/* Grab a request slot and copy shadow state into it. */
-		req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt);
-		*req = copy[i].req;
-
-		/* We get a new request id, and must reset the shadow state. */
-		req->u.rw.id = get_id_from_freelist(info);
-		memcpy(&info->shadow[req->u.rw.id], &copy[i], sizeof(copy[i]));
-
-		if (req->operation != BLKIF_OP_DISCARD) {
-		/* Rewrite any grant references invalidated by susp/resume. */
-			for (j = 0; j < req->u.rw.nr_segments; j++)
-				gnttab_grant_foreign_access_ref(
-					req->u.rw.seg[j].gref,
-					info->xbdev->otherend_id,
-					pfn_to_mfn(copy[i].grants_used[j]->pfn),
-					0);
+		/*
+		 * Get the bios in the request so we can re-queue them.
+		 */
+		if (copy[i].request->cmd_flags &
+		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+			/*
+			 * Flush operations don't contain bios, so
+			 * we need to requeue the whole request
+			 */
+			list_add(&copy[i].request->queuelist, &requests);
+			continue;
 		}
-		info->shadow[req->u.rw.id].req = *req;
-
-		info->ring.req_prod_pvt++;
+		merge_bio.head = copy[i].request->bio;
+		merge_bio.tail = copy[i].request->biotail;
+		bio_list_merge(&bio_list, &merge_bio);
+		copy[i].request->bio = NULL;
+		blk_put_request(copy[i].request);
 	}
 
 	kfree(copy);
 
+	/*
+	 * Empty the queue; this is important because we might have
+	 * requests in the queue with more segments than we can
+	 * handle now.
+	 */
+	spin_lock_irq(&info->io_lock);
+	while ((req = blk_fetch_request(info->rq)) != NULL) {
+		if (req->cmd_flags &
+		    (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) {
+			list_add(&req->queuelist, &requests);
+			continue;
+		}
+		merge_bio.head = req->bio;
+		merge_bio.tail = req->biotail;
+		bio_list_merge(&bio_list, &merge_bio);
+		req->bio = NULL;
+		if (req->cmd_flags & (REQ_FLUSH | REQ_FUA))
+			pr_alert("diskcache flush request found!\n");
+		__blk_put_request(info->rq, req);
+	}
+	spin_unlock_irq(&info->io_lock);
+
 	xenbus_switch_state(info->xbdev, XenbusStateConnected);
 
 	spin_lock_irq(&info->io_lock);
@@ -1281,14 +1493,50 @@ static int blkif_recover(struct blkfront_info *info)
 	/* Now safe for us to use the shared ring */
 	info->connected = BLKIF_STATE_CONNECTED;
 
-	/* Send off requeued requests */
-	flush_requests(info);
-
 	/* Kick any other new requests queued since we resumed */
 	kick_pending_request_queues(info);
 
+	list_for_each_entry_safe(req, n, &requests, queuelist) {
+		/* Requeue pending requests (flush or discard) */
+		list_del_init(&req->queuelist);
+		BUG_ON(req->nr_phys_segments > segs);
+		blk_requeue_request(info->rq, req);
+	}
 	spin_unlock_irq(&info->io_lock);
 
+	while ((bio = bio_list_pop(&bio_list)) != NULL) {
+		/* Traverse the list of pending bios and re-queue them */
+		if (bio_segments(bio) > segs) {
+			/*
+			 * This bio has more segments than what we can
+			 * handle, we have to split it.
+			 */
+			pending = (bio_segments(bio) + segs - 1) / segs;
+			split_bio = kzalloc(sizeof(*split_bio), GFP_NOIO);
+			BUG_ON(split_bio == NULL);
+			atomic_set(&split_bio->pending, pending);
+			split_bio->bio = bio;
+			for (i = 0; i < pending; i++) {
+				offset = (i * segs * PAGE_SIZE) >> 9;
+				size = min((unsigned int)(segs * PAGE_SIZE) >> 9,
+					   (unsigned int)(bio->bi_size >> 9) - offset);
+				cloned_bio = bio_clone(bio, GFP_NOIO);
+				BUG_ON(cloned_bio == NULL);
+				trim_bio(cloned_bio, offset, size);
+				cloned_bio->bi_private = split_bio;
+				cloned_bio->bi_end_io = split_bio_end;
+				submit_bio(cloned_bio->bi_rw, cloned_bio);
+			}
+			/*
+			 * Now we have to wait for all those smaller bios to
+			 * end, so we can also end the "parent" bio.
+			 */
+			continue;
+		}
+		/* We don't need to split this bio */
+		submit_bio(bio->bi_rw, bio);
+	}
+
 	return 0;
 }
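
To put numbers on the split logic above (hypothetical values, assuming 4 KiB pages): with segs = 32, each clone covers at most (32 * PAGE_SIZE) >> 9 = 256 sectors, so a requeued bio with 100 segments is split into pending = (100 + 31) / 32 = 4 clones, the last of which is trimmed to whatever remains of bi_size.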
 
@@ -1308,8 +1556,12 @@ static int blkfront_resume(struct xenbus_device *dev)
 	blkif_free(info, info->connected == BLKIF_STATE_CONNECTED);
 
 	err = talk_to_blkback(dev, info);
-	if (info->connected == BLKIF_STATE_SUSPENDED && !err)
-		err = blkif_recover(info);
+
+	/*
+	 * We have to wait for the backend to switch to
+	 * connected state, since we want to read which
+	 * features it supports.
+	 */
 
 	return err;
 }
@@ -1387,6 +1639,60 @@ static void blkfront_setup_discard(struct blkfront_info *info)
 	kfree(type);
 }
 
+static int blkfront_setup_indirect(struct blkfront_info *info)
+{
+	unsigned int indirect_segments, segs;
+	int err, i;
+
+	err = xenbus_gather(XBT_NIL, info->xbdev->otherend,
+			    "feature-max-indirect-segments", "%u", &indirect_segments,
+			    NULL);
+	if (err) {
+		info->max_indirect_segments = 0;
+		segs = BLKIF_MAX_SEGMENTS_PER_REQUEST;
+	} else {
+		info->max_indirect_segments = min(indirect_segments,
+						  xen_blkif_max_segments);
+		segs = info->max_indirect_segments;
+	}
+
+	err = fill_grant_buffer(info, (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE);
+	if (err)
+		goto out_of_memory;
+
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		info->shadow[i].grants_used = kzalloc(
+			sizeof(info->shadow[i].grants_used[0]) * segs,
+			GFP_NOIO);
+		info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * segs, GFP_NOIO);
+		if (info->max_indirect_segments)
+			info->shadow[i].indirect_grants = kzalloc(
+				sizeof(info->shadow[i].indirect_grants[0]) *
+				INDIRECT_GREFS(segs),
+				GFP_NOIO);
+		if ((info->shadow[i].grants_used == NULL) ||
+			(info->shadow[i].sg == NULL) ||
+		     (info->max_indirect_segments &&
+		     (info->shadow[i].indirect_grants == NULL)))
+			goto out_of_memory;
+		sg_init_table(info->shadow[i].sg, segs);
+	}
+
+
+	return 0;
+
+out_of_memory:
+	for (i = 0; i < BLK_RING_SIZE; i++) {
+		kfree(info->shadow[i].grants_used);
+		info->shadow[i].grants_used = NULL;
+		kfree(info->shadow[i].sg);
+		info->shadow[i].sg = NULL;
+		kfree(info->shadow[i].indirect_grants);
+		info->shadow[i].indirect_grants = NULL;
+	}
+	return -ENOMEM;
+}
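
For scale: every ring entry may need segs data grants plus INDIRECT_GREFS(segs) grants for the indirect pages themselves, hence the (segs + INDIRECT_GREFS(segs)) * BLK_RING_SIZE preallocation. Assuming the default of 32 indirect segments and the usual 32-entry single-page ring (both values are defined outside this hunk), that is (32 + 1) * 32 = 1056 granted pages per device, up from 11 * 32 = 352 with the old fixed-size buffer.
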
+
 /*
  * Invoked when the backend is finally 'ready' (and has told us
  * the details about the physical device - #sectors, size, etc).
@@ -1395,6 +1701,7 @@ static void blkfront_connect(struct blkfront_info *info)
 {
 	unsigned long long sectors;
 	unsigned long sector_size;
+	unsigned int physical_sector_size;
 	unsigned int binfo;
 	int err;
 	int barrier, flush, discard, persistent;
@@ -1414,8 +1721,15 @@ static void blkfront_connect(struct blkfront_info *info)
 		set_capacity(info->gd, sectors);
 		revalidate_disk(info->gd);
 
-		/* fall through */
+		return;
 	case BLKIF_STATE_SUSPENDED:
+		/*
+		 * If we are recovering from suspension, we need to wait
+		 * for the backend to announce its features before
+		 * reconnecting; at least we need to know whether the backend
+		 * supports indirect descriptors, and how many.
+		 */
+		blkif_recover(info);
 		return;
 
 	default:
@@ -1437,6 +1751,16 @@ static void blkfront_connect(struct blkfront_info *info)
 		return;
 	}
 
+	/*
+	 * physical-sector-size is a newer field, so old backends may not
+	 * provide it. Assume the physical sector size to be the same as
+	 * sector_size in that case.
+	 */
+	err = xenbus_scanf(XBT_NIL, info->xbdev->otherend,
+			   "physical-sector-size", "%u", &physical_sector_size);
+	if (err != 1)
+		physical_sector_size = sector_size;
+
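
xenbus_scanf() returns the number of values it parsed (or a negative errno), so the err != 1 check above covers both an older backend that does not publish "physical-sector-size" at all and a malformed value; in either case the frontend falls back to the logical sector size.
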
 	info->feature_flush = 0;
 	info->flush_op = 0;
 
@@ -1483,7 +1807,15 @@ static void blkfront_connect(struct blkfront_info *info)
 	else
 		info->feature_persistent = persistent;
 
-	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size);
+	err = blkfront_setup_indirect(info);
+	if (err) {
+		xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s",
+				 info->xbdev->otherend);
+		return;
+	}
+
+	err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size,
+				  physical_sector_size);
 	if (err) {
 		xenbus_dev_fatal(info->xbdev, err, "xlvbd_add at %s",
 				 info->xbdev->otherend);