summary refs log tree commit diff
path: root/net
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2016-01-14 16:08:23 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2016-01-14 16:08:23 -0800
commit75f26df6ae6f8787fc6198609c8be17400a69e25 (patch)
tree04d2173357b402ce1ee097ed7e40bcfc2153b937 /net
parent63f729cb4aa9a224cfd6bb35eab6b4556c29115d (diff)
parent44aab3e09ef947e546ee61c5082c41b86dd15e53 (diff)
downloadlinux-75f26df6ae6f8787fc6198609c8be17400a69e25.tar.gz
Merge tag 'nfs-for-4.5-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs
Pull NFS client updates from Trond Myklebust:
 "Highlights include:

  Stable fixes:
   - Fix a regression in the SunRPC socket polling code
   - Fix the attribute cache revalidation code
   - Fix race in __update_open_stateid()
   - Fix an lo->plh_block_lgets imbalance in layoutreturn
   - Fix an Oopsable typo in ff_mirror_match_fh()

  Features:
   - pNFS layout recall performance improvements.
   - pNFS/flexfiles: Support server-supplied layoutstats sampling period

  Bugfixes + cleanups:
   - NFSv4: Don't perform cached access checks before we've OPENed the
     file
   - Fix starvation issues with background flushes
   - Reclaim writes should be flushed as unstable writes if there are
     already entries in the commit lists
   - Various bugfixes from Chuck to fix NFS/RDMA send queue ordering
     problems
   - Ensure that we propagate fatal layoutget errors back to the
     application
   - Fixes for sundry flexfiles layoutstats bugs
   - Fix files/flexfiles to not cache invalidated layouts in the DS
     commit buckets"

* tag 'nfs-for-4.5-1' of git://git.linux-nfs.org/projects/trondmy/linux-nfs: (68 commits)
  NFS: Fix a compile warning about unused variable in nfs_generic_pg_pgios()
  NFSv4: Fix a compile warning about no prototype for nfs4_ioctl()
  NFS: Use wait_on_atomic_t() for unlock after readahead
  SUNRPC: Fixup socket wait for memory
  NFSv4.1/pNFS: Cleanup constify struct pnfs_layout_range arguments
  NFSv4.1/pnfs: Cleanup copying of pnfs_layout_range structures
  NFSv4.1/pNFS: Cleanup pnfs_mark_matching_lsegs_invalid()
  NFSv4.1/pNFS: Fix a race in initiate_file_draining()
  NFSv4.1/pNFS: pnfs_error_mark_layout_for_return() must always return layout
  NFSv4.1/pNFS: pnfs_mark_matching_lsegs_return() should set the iomode
  NFSv4.1/pNFS: Use nfs4_stateid_copy for copying stateids
  NFSv4.1/pNFS: Don't pass stateids by value to pnfs_send_layoutreturn()
  NFS: Relax requirements in nfs_flush_incompatible
  NFSv4.1/pNFS: Don't queue up a new commit if the layout segment is invalid
  NFS: Allow multiple commit requests in flight per file
  NFS/pNFS: Fix up pNFS write reschedule layering violations and bugs
  SUNRPC: Fix a missing break in rpc_anyaddr()
  pNFS/flexfiles: Fix an Oopsable typo in ff_mirror_match_fh()
  NFS: Fix attribute cache revalidation
  NFS: Ensure we revalidate attributes before using execute_ok()
  ...
Diffstat (limited to 'net')
-rw-r--r--net/sunrpc/clnt.c1
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c26
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c64
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c174
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c13
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c16
-rw-r--r--net/sunrpc/xprtrdma/transport.c3
-rw-r--r--net/sunrpc/xprtrdma/verbs.c16
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h14
-rw-r--r--net/sunrpc/xprtsock.c63
10 files changed, 298 insertions, 92 deletions
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 23608eb0ded2..b7f21044f4d8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1217,6 +1217,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen)
 			return -EINVAL;
 		memcpy(buf, &rpc_in6addr_loopback,
 				sizeof(rpc_in6addr_loopback));
+		break;
 	default:
 		dprintk("RPC:       %s: address family not supported\n",
 			__func__);
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 2dcb44f69e53..cc1251d07297 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -15,7 +15,7 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-#define RPCRDMA_BACKCHANNEL_DEBUG
+#undef RPCRDMA_BACKCHANNEL_DEBUG
 
 static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
 				 struct rpc_rqst *rqst)
@@ -42,8 +42,8 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 	size_t size;
 
 	req = rpcrdma_create_req(r_xprt);
-	if (!req)
-		return -ENOMEM;
+	if (IS_ERR(req))
+		return PTR_ERR(req);
 	req->rl_backchannel = true;
 
 	size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
@@ -84,9 +84,7 @@ out_fail:
 static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
 				 unsigned int count)
 {
-	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
 	struct rpcrdma_rep *rep;
-	unsigned long flags;
 	int rc = 0;
 
 	while (count--) {
@@ -98,9 +96,7 @@ static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
 			break;
 		}
 
-		spin_lock_irqsave(&buffers->rb_lock, flags);
-		list_add(&rep->rr_list, &buffers->rb_recv_bufs);
-		spin_unlock_irqrestore(&buffers->rb_lock, flags);
+		rpcrdma_recv_buffer_put(rep);
 	}
 
 	return rc;
@@ -140,6 +136,7 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
 			       __func__);
 			goto out_free;
 		}
+		dprintk("RPC:       %s: new rqst %p\n", __func__, rqst);
 
 		rqst->rq_xprt = &r_xprt->rx_xprt;
 		INIT_LIST_HEAD(&rqst->rq_list);
@@ -220,12 +217,14 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 
 	rpclen = rqst->rq_svec[0].iov_len;
 
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
 	pr_info("RPC:       %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
 		__func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
 	pr_info("RPC:       %s: RPC/RDMA: %*ph\n",
 		__func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
 	pr_info("RPC:       %s:      RPC: %*ph\n",
 		__func__, (int)rpclen, rqst->rq_svec[0].iov_base);
+#endif
 
 	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
 	req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
@@ -269,6 +268,9 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
 {
 	struct rpc_xprt *xprt = rqst->rq_xprt;
 
+	dprintk("RPC:       %s: freeing rqst %p (req %p)\n",
+		__func__, rqst, rpcr_to_rdmar(rqst));
+
 	smp_mb__before_atomic();
 	WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
 	clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
@@ -333,9 +335,7 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 				struct rpc_rqst, rq_bc_pa_list);
 	list_del(&rqst->rq_bc_pa_list);
 	spin_unlock(&xprt->bc_pa_lock);
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
-	pr_info("RPC:       %s: using rqst %p\n", __func__, rqst);
-#endif
+	dprintk("RPC:       %s: using rqst %p\n", __func__, rqst);
 
 	/* Prepare rqst */
 	rqst->rq_reply_bytes_recvd = 0;
@@ -355,10 +355,8 @@ void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
 	 * direction reply.
 	 */
 	req = rpcr_to_rdmar(rqst);
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
-	pr_info("RPC:       %s: attaching rep %p to req %p\n",
+	dprintk("RPC:       %s: attaching rep %p to req %p\n",
 		__func__, rep, req);
-#endif
 	req->rl_reply = rep;
 
 	/* Defeat the retransmit detection logic in send_request */
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index f1e8dafbd507..c14f3a4bff68 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -179,6 +179,69 @@ out_maperr:
 	return rc;
 }
 
+static void
+__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+	struct ib_device *device = r_xprt->rx_ia.ri_device;
+	struct rpcrdma_mw *mw = seg->rl_mw;
+	int nsegs = seg->mr_nsegs;
+
+	seg->rl_mw = NULL;
+
+	while (nsegs--)
+		rpcrdma_unmap_one(device, seg++);
+
+	rpcrdma_put_mw(r_xprt, mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ */
+static void
+fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct rpcrdma_mr_seg *seg;
+	unsigned int i, nchunks;
+	struct rpcrdma_mw *mw;
+	LIST_HEAD(unmap_list);
+	int rc;
+
+	dprintk("RPC:       %s: req %p\n", __func__, req);
+
+	/* ORDER: Invalidate all of the req's MRs first
+	 *
+	 * ib_unmap_fmr() is slow, so use a single call instead
+	 * of one call per mapped MR.
+	 */
+	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+		seg = &req->rl_segments[i];
+		mw = seg->rl_mw;
+
+		list_add(&mw->r.fmr.fmr->list, &unmap_list);
+
+		i += seg->mr_nsegs;
+	}
+	rc = ib_unmap_fmr(&unmap_list);
+	if (rc)
+		pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+
+	/* ORDER: Now DMA unmap all of the req's MRs, and return
+	 * them to the free MW list.
+	 */
+	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+		seg = &req->rl_segments[i];
+
+		__fmr_dma_unmap(r_xprt, seg);
+
+		i += seg->mr_nsegs;
+		seg->mr_nsegs = 0;
+	}
+
+	req->rl_nchunks = 0;
+}
+
 /* Use the ib_unmap_fmr() verb to prevent further remote
  * access via RDMA READ or RDMA WRITE.
  */
@@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
 
 const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_map				= fmr_op_map,
+	.ro_unmap_sync			= fmr_op_unmap_sync,
 	.ro_unmap			= fmr_op_unmap,
 	.ro_open			= fmr_op_open,
 	.ro_maxpages			= fmr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 88cf9e7269c2..c6836844bd0e 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -245,12 +245,14 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
 		     rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
 }
 
-/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
+/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs
+ * to be reset.
+ *
+ * WARNING: Only wr_id and status are reliable at this point
+ */
 static void
-frwr_sendcompletion(struct ib_wc *wc)
+__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_mw *r)
 {
-	struct rpcrdma_mw *r;
-
 	if (likely(wc->status == IB_WC_SUCCESS))
 		return;
 
@@ -261,9 +263,23 @@ frwr_sendcompletion(struct ib_wc *wc)
 	else
 		pr_warn("RPC:       %s: frmr %p error, status %s (%d)\n",
 			__func__, r, ib_wc_status_msg(wc->status), wc->status);
+
 	r->r.frmr.fr_state = FRMR_IS_STALE;
 }
 
+static void
+frwr_sendcompletion(struct ib_wc *wc)
+{
+	struct rpcrdma_mw *r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
+	struct rpcrdma_frmr *f = &r->r.frmr;
+
+	if (unlikely(wc->status != IB_WC_SUCCESS))
+		__frwr_sendcompletion_flush(wc, r);
+
+	if (f->fr_waiter)
+		complete(&f->fr_linv_done);
+}
+
 static int
 frwr_op_init(struct rpcrdma_xprt *r_xprt)
 {
@@ -319,7 +335,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	struct rpcrdma_mw *mw;
 	struct rpcrdma_frmr *frmr;
 	struct ib_mr *mr;
-	struct ib_reg_wr reg_wr;
+	struct ib_reg_wr *reg_wr;
 	struct ib_send_wr *bad_wr;
 	int rc, i, n, dma_nents;
 	u8 key;
@@ -335,7 +351,9 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	} while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
 	frmr = &mw->r.frmr;
 	frmr->fr_state = FRMR_IS_VALID;
+	frmr->fr_waiter = false;
 	mr = frmr->fr_mr;
+	reg_wr = &frmr->fr_regwr;
 
 	if (nsegs > ia->ri_max_frmr_depth)
 		nsegs = ia->ri_max_frmr_depth;
@@ -381,19 +399,19 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 	key = (u8)(mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(mr, ++key);
 
-	reg_wr.wr.next = NULL;
-	reg_wr.wr.opcode = IB_WR_REG_MR;
-	reg_wr.wr.wr_id = (uintptr_t)mw;
-	reg_wr.wr.num_sge = 0;
-	reg_wr.wr.send_flags = 0;
-	reg_wr.mr = mr;
-	reg_wr.key = mr->rkey;
-	reg_wr.access = writing ?
-			IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
-			IB_ACCESS_REMOTE_READ;
+	reg_wr->wr.next = NULL;
+	reg_wr->wr.opcode = IB_WR_REG_MR;
+	reg_wr->wr.wr_id = (uintptr_t)mw;
+	reg_wr->wr.num_sge = 0;
+	reg_wr->wr.send_flags = 0;
+	reg_wr->mr = mr;
+	reg_wr->key = mr->rkey;
+	reg_wr->access = writing ?
+			 IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+			 IB_ACCESS_REMOTE_READ;
 
 	DECR_CQCOUNT(&r_xprt->rx_ep);
-	rc = ib_post_send(ia->ri_id->qp, &reg_wr.wr, &bad_wr);
+	rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
 	if (rc)
 		goto out_senderr;
 
@@ -413,6 +431,116 @@ out_senderr:
 	return rc;
 }
 
+static struct ib_send_wr *
+__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+{
+	struct rpcrdma_mw *mw = seg->rl_mw;
+	struct rpcrdma_frmr *f = &mw->r.frmr;
+	struct ib_send_wr *invalidate_wr;
+
+	f->fr_waiter = false;
+	f->fr_state = FRMR_IS_INVALID;
+	invalidate_wr = &f->fr_invwr;
+
+	memset(invalidate_wr, 0, sizeof(*invalidate_wr));
+	invalidate_wr->wr_id = (unsigned long)(void *)mw;
+	invalidate_wr->opcode = IB_WR_LOCAL_INV;
+	invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
+
+	return invalidate_wr;
+}
+
+static void
+__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+		 int rc)
+{
+	struct ib_device *device = r_xprt->rx_ia.ri_device;
+	struct rpcrdma_mw *mw = seg->rl_mw;
+	struct rpcrdma_frmr *f = &mw->r.frmr;
+
+	seg->rl_mw = NULL;
+
+	ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
+
+	if (!rc)
+		rpcrdma_put_mw(r_xprt, mw);
+	else
+		__frwr_queue_recovery(mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ */
+static void
+frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	struct rpcrdma_mr_seg *seg;
+	unsigned int i, nchunks;
+	struct rpcrdma_frmr *f;
+	int rc;
+
+	dprintk("RPC:       %s: req %p\n", __func__, req);
+
+	/* ORDER: Invalidate all of the req's MRs first
+	 *
+	 * Chain the LOCAL_INV Work Requests and post them with
+	 * a single ib_post_send() call.
+	 */
+	invalidate_wrs = pos = prev = NULL;
+	seg = NULL;
+	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+		seg = &req->rl_segments[i];
+
+		pos = __frwr_prepare_linv_wr(seg);
+
+		if (!invalidate_wrs)
+			invalidate_wrs = pos;
+		else
+			prev->next = pos;
+		prev = pos;
+
+		i += seg->mr_nsegs;
+	}
+	f = &seg->rl_mw->r.frmr;
+
+	/* Strong send queue ordering guarantees that when the
+	 * last WR in the chain completes, all WRs in the chain
+	 * are complete.
+	 */
+	f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+	f->fr_waiter = true;
+	init_completion(&f->fr_linv_done);
+	INIT_CQCOUNT(&r_xprt->rx_ep);
+
+	/* Transport disconnect drains the receive CQ before it
+	 * replaces the QP. The RPC reply handler won't call us
+	 * unless ri_id->qp is a valid pointer.
+	 */
+	rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+	if (rc)
+		pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+
+	wait_for_completion(&f->fr_linv_done);
+
+	/* ORDER: Now DMA unmap all of the req's MRs, and return
+	 * them to the free MW list.
+	 */
+	for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+		seg = &req->rl_segments[i];
+
+		__frwr_dma_unmap(r_xprt, seg, rc);
+
+		i += seg->mr_nsegs;
+		seg->mr_nsegs = 0;
+	}
+
+	req->rl_nchunks = 0;
+}
+
 /* Post a LOCAL_INV Work Request to prevent further remote access
  * via RDMA READ or RDMA WRITE.
  */
@@ -423,23 +551,24 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mw *mw = seg1->rl_mw;
 	struct rpcrdma_frmr *frmr = &mw->r.frmr;
-	struct ib_send_wr invalidate_wr, *bad_wr;
+	struct ib_send_wr *invalidate_wr, *bad_wr;
 	int rc, nsegs = seg->mr_nsegs;
 
 	dprintk("RPC:       %s: FRMR %p\n", __func__, mw);
 
 	seg1->rl_mw = NULL;
 	frmr->fr_state = FRMR_IS_INVALID;
+	invalidate_wr = &mw->r.frmr.fr_invwr;
 
-	memset(&invalidate_wr, 0, sizeof(invalidate_wr));
-	invalidate_wr.wr_id = (unsigned long)(void *)mw;
-	invalidate_wr.opcode = IB_WR_LOCAL_INV;
-	invalidate_wr.ex.invalidate_rkey = frmr->fr_mr->rkey;
+	memset(invalidate_wr, 0, sizeof(*invalidate_wr));
+	invalidate_wr->wr_id = (uintptr_t)mw;
+	invalidate_wr->opcode = IB_WR_LOCAL_INV;
+	invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
 	DECR_CQCOUNT(&r_xprt->rx_ep);
 
 	ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
 	read_lock(&ia->ri_qplock);
-	rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+	rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
 	read_unlock(&ia->ri_qplock);
 	if (rc)
 		goto out_err;
@@ -471,6 +600,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
 
 const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_map				= frwr_op_map,
+	.ro_unmap_sync			= frwr_op_unmap_sync,
 	.ro_unmap			= frwr_op_unmap,
 	.ro_open			= frwr_op_open,
 	.ro_maxpages			= frwr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 617b76f22154..dbb302ecf590 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -83,6 +83,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
 	return 1;
 }
 
+/* DMA unmap all memory regions that were mapped for "req".
+ */
+static void
+physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+	struct ib_device *device = r_xprt->rx_ia.ri_device;
+	unsigned int i;
+
+	for (i = 0; req->rl_nchunks; --req->rl_nchunks)
+		rpcrdma_unmap_one(device, &req->rl_segments[i++]);
+}
+
 static void
 physical_op_destroy(struct rpcrdma_buffer *buf)
 {
@@ -90,6 +102,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
 
 const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
 	.ro_map				= physical_op_map,
+	.ro_unmap_sync			= physical_op_unmap_sync,
 	.ro_unmap			= physical_op_unmap,
 	.ro_open			= physical_op_open,
 	.ro_maxpages			= physical_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index c10d9699441c..0f28f2d743ed 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -804,6 +804,11 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
 	if (req->rl_reply)
 		goto out_duplicate;
 
+	/* Sanity checking has passed. We are now committed
+	 * to complete this transaction.
+	 */
+	list_del_init(&rqst->rq_list);
+	spin_unlock_bh(&xprt->transport_lock);
 	dprintk("RPC:       %s: reply 0x%p completes request 0x%p\n"
 		"                   RPC request 0x%p xid 0x%08x\n",
 			__func__, rep, req, rqst,
@@ -888,12 +893,23 @@ badheader:
 		break;
 	}
 
+	/* Invalidate and flush the data payloads before waking the
+	 * waiting application. This guarantees the memory region is
+	 * properly fenced from the server before the application
+	 * accesses the data. It also ensures proper send flow
+	 * control: waking the next RPC waits until this RPC has
+	 * relinquished all its Send Queue entries.
+	 */
+	if (req->rl_nchunks)
+		r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
+
 	credits = be32_to_cpu(headerp->rm_credit);
 	if (credits == 0)
 		credits = 1;	/* don't deadlock */
 	else if (credits > r_xprt->rx_buf.rb_max_requests)
 		credits = r_xprt->rx_buf.rb_max_requests;
 
+	spin_lock_bh(&xprt->transport_lock);
 	cwnd = xprt->cwnd;
 	xprt->cwnd = credits << RPC_CWNDSHIFT;
 	if (xprt->cwnd > cwnd)
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 8c545f7d7525..740bddcf3488 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -576,6 +576,9 @@ xprt_rdma_free(void *buffer)
 
 	rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
 	req = rb->rg_owner;
+	if (req->rl_backchannel)
+		return;
+
 	r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
 
 	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index eadd1655145a..732c71ce5dca 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -616,10 +616,8 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 
 	/* set trigger for requesting send completion */
 	ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
-	if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
-		ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
-	else if (ep->rep_cqinit <= 2)
-		ep->rep_cqinit = 0;
+	if (ep->rep_cqinit <= 2)
+		ep->rep_cqinit = 0;	/* always signal? */
 	INIT_CQCOUNT(ep);
 	init_waitqueue_head(&ep->rep_connect_wait);
 	INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
@@ -852,10 +850,11 @@ retry:
 
 		if (extras) {
 			rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
-			if (rc)
+			if (rc) {
 				pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
 					__func__, rc);
 				rc = 0;
+			}
 		}
 	}
 
@@ -1337,15 +1336,14 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 	struct rpcrdma_rep *rep;
-	unsigned long flags;
 	int rc;
 
 	while (count--) {
-		spin_lock_irqsave(&buffers->rb_lock, flags);
+		spin_lock(&buffers->rb_lock);
 		if (list_empty(&buffers->rb_recv_bufs))
 			goto out_reqbuf;
 		rep = rpcrdma_buffer_get_rep_locked(buffers);
-		spin_unlock_irqrestore(&buffers->rb_lock, flags);
+		spin_unlock(&buffers->rb_lock);
 
 		rc = rpcrdma_ep_post_recv(ia, ep, rep);
 		if (rc)
@@ -1355,7 +1353,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
 	return 0;
 
 out_reqbuf:
-	spin_unlock_irqrestore(&buffers->rb_lock, flags);
+	spin_unlock(&buffers->rb_lock);
 	pr_warn("%s: no extra receive buffers\n", __func__);
 	return -ENOMEM;
 
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index ac7f8d4f632a..728101ddc44b 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -88,12 +88,6 @@ struct rpcrdma_ep {
 	struct delayed_work	rep_connect_worker;
 };
 
-/*
- * Force a signaled SEND Work Request every so often,
- * in case the provider needs to do some housekeeping.
- */
-#define RPCRDMA_MAX_UNSIGNALED_SENDS	(32)
-
 #define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
 #define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
 
@@ -207,6 +201,12 @@ struct rpcrdma_frmr {
 	enum rpcrdma_frmr_state		fr_state;
 	struct work_struct		fr_work;
 	struct rpcrdma_xprt		*fr_xprt;
+	bool				fr_waiter;
+	struct completion		fr_linv_done;;
+	union {
+		struct ib_reg_wr	fr_regwr;
+		struct ib_send_wr	fr_invwr;
+	};
 };
 
 struct rpcrdma_fmr {
@@ -364,6 +364,8 @@ struct rpcrdma_xprt;
 struct rpcrdma_memreg_ops {
 	int		(*ro_map)(struct rpcrdma_xprt *,
 				  struct rpcrdma_mr_seg *, int, bool);
+	void		(*ro_unmap_sync)(struct rpcrdma_xprt *,
+					 struct rpcrdma_req *);
 	int		(*ro_unmap)(struct rpcrdma_xprt *,
 				    struct rpcrdma_mr_seg *);
 	int		(*ro_open)(struct rpcrdma_ia *,
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 2ffaf6a79499..fde2138b81e7 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -398,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
 	if (unlikely(!sock))
 		return -ENOTSOCK;
 
-	clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags);
 	if (base != 0) {
 		addr = NULL;
 		addrlen = 0;
@@ -442,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task)
 	struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
 
 	transport->inet->sk_write_pending--;
-	clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
 }
 
 /**
@@ -467,20 +465,11 @@ static int xs_nospace(struct rpc_task *task)
 
 	/* Don't race with disconnect */
 	if (xprt_connected(xprt)) {
-		if (test_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags)) {
-			/*
-			 * Notify TCP that we're limited by the application
-			 * window size
-			 */
-			set_bit(SOCK_NOSPACE, &transport->sock->flags);
-			sk->sk_write_pending++;
-			/* ...and wait for more buffer space */
-			xprt_wait_for_buffer_space(task, xs_nospace_callback);
-		}
-	} else {
-		clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
+		/* wait for more buffer space */
+		sk->sk_write_pending++;
+		xprt_wait_for_buffer_space(task, xs_nospace_callback);
+	} else
 		ret = -ENOTCONN;
-	}
 
 	spin_unlock_bh(&xprt->transport_lock);
 
@@ -616,9 +605,6 @@ process_status:
 	case -EAGAIN:
 		status = xs_nospace(task);
 		break;
-	default:
-		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
-			-status);
 	case -ENETUNREACH:
 	case -ENOBUFS:
 	case -EPIPE:
@@ -626,7 +612,10 @@ process_status:
 	case -EPERM:
 		/* When the server has died, an ICMP port unreachable message
 		 * prompts ECONNREFUSED. */
-		clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
+		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);
 	}
 
 	return status;
@@ -706,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task)
 	case -EAGAIN:
 		status = xs_nospace(task);
 		break;
-	default:
-		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
-			-status);
 	case -ECONNRESET:
 	case -ECONNREFUSED:
 	case -ENOTCONN:
 	case -EADDRINUSE:
 	case -ENOBUFS:
 	case -EPIPE:
-		clear_bit(SOCKWQ_ASYNC_NOSPACE, &transport->sock->flags);
+		break;
+	default:
+		dprintk("RPC:       sendmsg returned unrecognized error %d\n",
+			-status);
 	}
 
 	return status;
@@ -1609,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk)
 
 static void xs_write_space(struct sock *sk)
 {
-	struct socket *sock;
+	struct socket_wq *wq;
 	struct rpc_xprt *xprt;
 
-	if (unlikely(!(sock = sk->sk_socket)))
+	if (!sk->sk_socket)
 		return;
-	clear_bit(SOCK_NOSPACE, &sock->flags);
+	clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
 
 	if (unlikely(!(xprt = xprt_from_sock(sk))))
 		return;
-	if (test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &sock->flags) == 0)
-		return;
+	rcu_read_lock();
+	wq = rcu_dereference(sk->sk_wq);
+	if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
+		goto out;
 
 	xprt_write_space(xprt);
+out:
+	rcu_read_unlock();
 }
 
 /**
@@ -1907,18 +1900,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
 	}
 }
 #else
-static inline void xs_reclassify_socketu(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket4(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket6(struct socket *sock)
-{
-}
-
 static inline void xs_reclassify_socket(int family, struct socket *sock)
 {
 }
@@ -2008,7 +1989,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
 			"transport socket (%d).\n", -status);
 		goto out;
 	}
-	xs_reclassify_socketu(sock);
+	xs_reclassify_socket(AF_LOCAL, sock);
 
 	dprintk("RPC:       worker connecting xprt %p via AF_LOCAL to %s\n",
 			xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);