summary refs log tree commit diff
path: root/drivers/infiniband/hw
diff options
context:
space:
mode:
author Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-11 19:43:13 -0700
committer Linus Torvalds <torvalds@woody.linux-foundation.org> 2007-10-11 19:43:13 -0700
commit ce9d3c9a6a9aef61525be07fe6ba27d937236aa2 (patch)
tree 1b29bcb8f60fc6b59fa0d7b833cc733b8ebe17c9 /drivers/infiniband/hw
parent 038a5008b2f395c85e6e71d6ddf3c684e7c405b0 (diff)
parent 3d73c2884f45f9a297cbc956cea101405a9703f2 (diff)
download linux-ce9d3c9a6a9aef61525be07fe6ba27d937236aa2.tar.gz
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband
* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/roland/infiniband: (87 commits)
  mlx4_core: Fix section mismatches
  IPoIB: Allow setting policy to ignore multicast groups
  IB/mthca: Mark error paths as unlikely() in post_srq_recv functions
  IB/ipath: Minor fix to ordering of freeing and zeroing of tid pages.
  IB/ipath: Remove redundant link state checks
  IB/ipath: Fix IB_EVENT_PORT_ERR event
  IB/ipath: Better handling of unexpected GPIO interrupts
  IB/ipath: Maintain active time on all chips
  IB/ipath: Fix QHT7040 serial number check
  IB/ipath: Indicate a couple of chip bugs to userspace
  IB/ipath: iba6110 rev4 no longer needs recv header overrun workaround
  IB/ipath: Use counters in ipath_poll and cleanup interrupts in ipath_close
  IB/ipath: Remove duplicate copy of LMC
  IB/ipath: Add ability to set the LMC via the sysfs debugging interface
  IB/ipath: Optimize completion queue entry insertion and polling
  IB/ipath: Implement IB_EVENT_QP_LAST_WQE_REACHED
  IB/ipath: Generate flush CQE when QP is in error state
  IB/ipath: Remove redundant code
  IB/ipath: Future proof eeprom checksum code (contents reading)
  IB/ipath: UC RDMA WRITE with IMMEDIATE doesn't send the immediate
  ...
Diffstat (limited to 'drivers/infiniband/hw')
-rw-r--r-- drivers/infiniband/hw/cxgb3/iwch_cm.c 16
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_classes.h 14
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_cq.c 23
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_hca.c 34
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_irq.c 33
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_main.c 52
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_mcast.c 4
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_mrmw.c 102
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_qp.c 169
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_reqs.c 2
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_sqp.c 2
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_tools.h 19
-rw-r--r-- drivers/infiniband/hw/ehca/ehca_uverbs.c 46
-rw-r--r-- drivers/infiniband/hw/ehca/hcp_if.c 105
-rw-r--r-- drivers/infiniband/hw/ehca/ipz_pt_fn.c 1
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_common.h 4
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_cq.c 94
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_diag.c 22
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_driver.c 93
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_eeprom.c 10
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_file_ops.c 74
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_fs.c 187
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_iba6110.c 57
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_iba6120.c 18
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_intr.c 64
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_kernel.h 12
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_mad.c 53
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_qp.c 31
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_rc.c 73
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_ruc.c 308
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_stats.c 17
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_sysfs.c 40
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_uc.c 98
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_ud.c 382
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_verbs.c 329
-rw-r--r-- drivers/infiniband/hw/ipath/ipath_verbs.h 45
-rw-r--r-- drivers/infiniband/hw/mlx4/main.c 50
-rw-r--r-- drivers/infiniband/hw/mlx4/mlx4_ib.h 16
-rw-r--r-- drivers/infiniband/hw/mlx4/mr.c 100
-rw-r--r-- drivers/infiniband/hw/mlx4/qp.c 14
-rw-r--r-- drivers/infiniband/hw/mlx4/srq.c 2
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_cmd.c 6
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_dev.h 2
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_main.c 110
-rw-r--r-- drivers/infiniband/hw/mthca/mthca_srq.c 8
45 files changed, 1544 insertions, 1397 deletions
diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c
index 1cdfcd43b0bc..20ba372dd182 100644
--- a/drivers/infiniband/hw/cxgb3/iwch_cm.c
+++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c
@@ -63,37 +63,37 @@ static char *states[] = {
 };
 
 static int ep_timeout_secs = 10;
-module_param(ep_timeout_secs, int, 0444);
+module_param(ep_timeout_secs, int, 0644);
 MODULE_PARM_DESC(ep_timeout_secs, "CM Endpoint operation timeout "
 				   "in seconds (default=10)");
 
 static int mpa_rev = 1;
-module_param(mpa_rev, int, 0444);
+module_param(mpa_rev, int, 0644);
 MODULE_PARM_DESC(mpa_rev, "MPA Revision, 0 supports amso1100, "
 		 "1 is spec compliant. (default=1)");
 
 static int markers_enabled = 0;
-module_param(markers_enabled, int, 0444);
+module_param(markers_enabled, int, 0644);
 MODULE_PARM_DESC(markers_enabled, "Enable MPA MARKERS (default(0)=disabled)");
 
 static int crc_enabled = 1;
-module_param(crc_enabled, int, 0444);
+module_param(crc_enabled, int, 0644);
 MODULE_PARM_DESC(crc_enabled, "Enable MPA CRC (default(1)=enabled)");
 
 static int rcv_win = 256 * 1024;
-module_param(rcv_win, int, 0444);
+module_param(rcv_win, int, 0644);
 MODULE_PARM_DESC(rcv_win, "TCP receive window in bytes (default=256)");
 
 static int snd_win = 32 * 1024;
-module_param(snd_win, int, 0444);
+module_param(snd_win, int, 0644);
 MODULE_PARM_DESC(snd_win, "TCP send window in bytes (default=32KB)");
 
 static unsigned int nocong = 0;
-module_param(nocong, uint, 0444);
+module_param(nocong, uint, 0644);
 MODULE_PARM_DESC(nocong, "Turn off congestion control (default=0)");
 
 static unsigned int cong_flavor = 1;
-module_param(cong_flavor, uint, 0444);
+module_param(cong_flavor, uint, 0644);
 MODULE_PARM_DESC(cong_flavor, "TCP Congestion control flavor (default=1)");
 
 static void process_work(struct work_struct *work);
diff --git a/drivers/infiniband/hw/ehca/ehca_classes.h b/drivers/infiniband/hw/ehca/ehca_classes.h
index b5e960305316..0f7a55d35ea7 100644
--- a/drivers/infiniband/hw/ehca/ehca_classes.h
+++ b/drivers/infiniband/hw/ehca/ehca_classes.h
@@ -53,6 +53,7 @@ struct ehca_pd;
 struct ehca_av;
 
 #include <linux/wait.h>
+#include <linux/mutex.h>
 
 #include <rdma/ib_verbs.h>
 #include <rdma/ib_user_verbs.h>
@@ -99,10 +100,10 @@ struct ehca_sport {
 	struct ehca_sma_attr saved_attr;
 };
 
-#define HCA_CAP_MR_PGSIZE_4K  1
-#define HCA_CAP_MR_PGSIZE_64K 2
-#define HCA_CAP_MR_PGSIZE_1M  4
-#define HCA_CAP_MR_PGSIZE_16M 8
+#define HCA_CAP_MR_PGSIZE_4K  0x80000000
+#define HCA_CAP_MR_PGSIZE_64K 0x40000000
+#define HCA_CAP_MR_PGSIZE_1M  0x20000000
+#define HCA_CAP_MR_PGSIZE_16M 0x10000000
 
 struct ehca_shca {
 	struct ib_device ib_device;
@@ -337,6 +338,8 @@ struct ehca_create_cq_resp {
 	u32 cq_number;
 	u32 token;
 	struct ipzu_queue_resp ipz_queue;
+	u32 fw_handle_ofs;
+	u32 dummy;
 };
 
 struct ehca_create_qp_resp {
@@ -347,7 +350,8 @@ struct ehca_create_qp_resp {
 	u32 qkey;
 	/* qp_num assigned by ehca: sqp0/1 may have got different numbers */
 	u32 real_qp_num;
-	u32 dummy; /* padding for 8 byte alignment */
+	u32 fw_handle_ofs;
+	u32 dummy;
 	struct ipzu_queue_resp ipz_squeue;
 	struct ipzu_queue_resp ipz_rqueue;
 };
diff --git a/drivers/infiniband/hw/ehca/ehca_cq.c b/drivers/infiniband/hw/ehca/ehca_cq.c
index 81aff36101ba..79c25f51c21e 100644
--- a/drivers/infiniband/hw/ehca/ehca_cq.c
+++ b/drivers/infiniband/hw/ehca/ehca_cq.c
@@ -166,7 +166,6 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 		write_lock_irqsave(&ehca_cq_idr_lock, flags);
 		ret = idr_get_new(&ehca_cq_idr, my_cq, &my_cq->token);
 		write_unlock_irqrestore(&ehca_cq_idr_lock, flags);
-
 	} while (ret == -EAGAIN);
 
 	if (ret) {
@@ -176,6 +175,12 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 		goto create_cq_exit1;
 	}
 
+	if (my_cq->token > 0x1FFFFFF) {
+		cq = ERR_PTR(-ENOMEM);
+		ehca_err(device, "Invalid number of cq. device=%p", device);
+		goto create_cq_exit2;
+	}
+
 	/*
 	 * CQs maximum depth is 4GB-64, but we need additional 20 as buffer
 	 * for receiving errors CQEs.
@@ -185,7 +190,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 
 	if (h_ret != H_SUCCESS) {
 		ehca_err(device, "hipz_h_alloc_resource_cq() failed "
-			 "h_ret=%lx device=%p", h_ret, device);
+			 "h_ret=%li device=%p", h_ret, device);
 		cq = ERR_PTR(ehca2ib_return_code(h_ret));
 		goto create_cq_exit2;
 	}
@@ -193,7 +198,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 	ipz_rc = ipz_queue_ctor(NULL, &my_cq->ipz_queue, param.act_pages,
 				EHCA_PAGESIZE, sizeof(struct ehca_cqe), 0, 0);
 	if (!ipz_rc) {
-		ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%x device=%p",
+		ehca_err(device, "ipz_queue_ctor() failed ipz_rc=%i device=%p",
 			 ipz_rc, device);
 		cq = ERR_PTR(-EINVAL);
 		goto create_cq_exit3;
@@ -221,7 +226,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 
 		if (h_ret < H_SUCCESS) {
 			ehca_err(device, "hipz_h_register_rpage_cq() failed "
-				 "ehca_cq=%p cq_num=%x h_ret=%lx counter=%i "
+				 "ehca_cq=%p cq_num=%x h_ret=%li counter=%i "
 				 "act_pages=%i", my_cq, my_cq->cq_number,
 				 h_ret, counter, param.act_pages);
 			cq = ERR_PTR(-EINVAL);
@@ -233,7 +238,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 			if ((h_ret != H_SUCCESS) || vpage) {
 				ehca_err(device, "Registration of pages not "
 					 "complete ehca_cq=%p cq_num=%x "
-					 "h_ret=%lx", my_cq, my_cq->cq_number,
+					 "h_ret=%li", my_cq, my_cq->cq_number,
 					 h_ret);
 				cq = ERR_PTR(-EAGAIN);
 				goto create_cq_exit4;
@@ -241,7 +246,7 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 		} else {
 			if (h_ret != H_PAGE_REGISTERED) {
 				ehca_err(device, "Registration of page failed "
-					 "ehca_cq=%p cq_num=%x h_ret=%lx"
+					 "ehca_cq=%p cq_num=%x h_ret=%li"
 					 "counter=%i act_pages=%i",
 					 my_cq, my_cq->cq_number,
 					 h_ret, counter, param.act_pages);
@@ -276,6 +281,8 @@ struct ib_cq *ehca_create_cq(struct ib_device *device, int cqe, int comp_vector,
 		resp.ipz_queue.queue_length = ipz_queue->queue_length;
 		resp.ipz_queue.pagesize = ipz_queue->pagesize;
 		resp.ipz_queue.toggle_state = ipz_queue->toggle_state;
+		resp.fw_handle_ofs = (u32)
+			(my_cq->galpas.user.fw_handle & (PAGE_SIZE - 1));
 		if (ib_copy_to_udata(udata, &resp, sizeof(resp))) {
 			ehca_err(device, "Copy to udata failed.");
 			goto create_cq_exit4;
@@ -291,7 +298,7 @@ create_cq_exit3:
 	h_ret = hipz_h_destroy_cq(adapter_handle, my_cq, 1);
 	if (h_ret != H_SUCCESS)
 		ehca_err(device, "hipz_h_destroy_cq() failed ehca_cq=%p "
-			 "cq_num=%x h_ret=%lx", my_cq, my_cq->cq_number, h_ret);
+			 "cq_num=%x h_ret=%li", my_cq, my_cq->cq_number, h_ret);
 
 create_cq_exit2:
 	write_lock_irqsave(&ehca_cq_idr_lock, flags);
@@ -355,7 +362,7 @@ int ehca_destroy_cq(struct ib_cq *cq)
 				 cq_num);
 	}
 	if (h_ret != H_SUCCESS) {
-		ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%lx "
+		ehca_err(device, "hipz_h_destroy_cq() failed h_ret=%li "
 			 "ehca_cq=%p cq_num=%x", h_ret, my_cq, cq_num);
 		return ehca2ib_return_code(h_ret);
 	}
diff --git a/drivers/infiniband/hw/ehca/ehca_hca.c b/drivers/infiniband/hw/ehca/ehca_hca.c
index cf22472d9414..4aa3ffa6a19f 100644
--- a/drivers/infiniband/hw/ehca/ehca_hca.c
+++ b/drivers/infiniband/hw/ehca/ehca_hca.c
@@ -82,17 +82,17 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
 	props->vendor_id       = rblock->vendor_id >> 8;
 	props->vendor_part_id  = rblock->vendor_part_id >> 16;
 	props->hw_ver          = rblock->hw_ver;
-	props->max_qp          = min_t(int, rblock->max_qp, INT_MAX);
-	props->max_qp_wr       = min_t(int, rblock->max_wqes_wq, INT_MAX);
-	props->max_sge         = min_t(int, rblock->max_sge, INT_MAX);
-	props->max_sge_rd      = min_t(int, rblock->max_sge_rd, INT_MAX);
-	props->max_cq          = min_t(int, rblock->max_cq, INT_MAX);
-	props->max_cqe         = min_t(int, rblock->max_cqe, INT_MAX);
-	props->max_mr          = min_t(int, rblock->max_mr, INT_MAX);
-	props->max_mw          = min_t(int, rblock->max_mw, INT_MAX);
-	props->max_pd          = min_t(int, rblock->max_pd, INT_MAX);
-	props->max_ah          = min_t(int, rblock->max_ah, INT_MAX);
-	props->max_fmr         = min_t(int, rblock->max_mr, INT_MAX);
+	props->max_qp          = min_t(unsigned, rblock->max_qp, INT_MAX);
+	props->max_qp_wr       = min_t(unsigned, rblock->max_wqes_wq, INT_MAX);
+	props->max_sge         = min_t(unsigned, rblock->max_sge, INT_MAX);
+	props->max_sge_rd      = min_t(unsigned, rblock->max_sge_rd, INT_MAX);
+	props->max_cq          = min_t(unsigned, rblock->max_cq, INT_MAX);
+	props->max_cqe         = min_t(unsigned, rblock->max_cqe, INT_MAX);
+	props->max_mr          = min_t(unsigned, rblock->max_mr, INT_MAX);
+	props->max_mw          = min_t(unsigned, rblock->max_mw, INT_MAX);
+	props->max_pd          = min_t(unsigned, rblock->max_pd, INT_MAX);
+	props->max_ah          = min_t(unsigned, rblock->max_ah, INT_MAX);
+	props->max_fmr         = min_t(unsigned, rblock->max_mr, INT_MAX);
 
 	if (EHCA_BMASK_GET(HCA_CAP_SRQ, shca->hca_cap)) {
 		props->max_srq         = props->max_qp;
@@ -104,15 +104,15 @@ int ehca_query_device(struct ib_device *ibdev, struct ib_device_attr *props)
 	props->local_ca_ack_delay
 		= rblock->local_ca_ack_delay;
 	props->max_raw_ipv6_qp
-		= min_t(int, rblock->max_raw_ipv6_qp, INT_MAX);
+		= min_t(unsigned, rblock->max_raw_ipv6_qp, INT_MAX);
 	props->max_raw_ethy_qp
-		= min_t(int, rblock->max_raw_ethy_qp, INT_MAX);
+		= min_t(unsigned, rblock->max_raw_ethy_qp, INT_MAX);
 	props->max_mcast_grp
-		= min_t(int, rblock->max_mcast_grp, INT_MAX);
+		= min_t(unsigned, rblock->max_mcast_grp, INT_MAX);
 	props->max_mcast_qp_attach
-		= min_t(int, rblock->max_mcast_qp_attach, INT_MAX);
+		= min_t(unsigned, rblock->max_mcast_qp_attach, INT_MAX);
 	props->max_total_mcast_qp_attach
-		= min_t(int, rblock->max_total_mcast_qp_attach, INT_MAX);
+		= min_t(unsigned, rblock->max_total_mcast_qp_attach, INT_MAX);
 
 	/* translate device capabilities */
 	props->device_cap_flags = IB_DEVICE_SYS_IMAGE_GUID |
@@ -352,7 +352,7 @@ int ehca_modify_port(struct ib_device *ibdev,
 	hret = hipz_h_modify_port(shca->ipz_hca_handle, port,
 				  cap, props->init_type, port_modify_mask);
 	if (hret != H_SUCCESS) {
-		ehca_err(&shca->ib_device, "Modify port failed  hret=%lx",
+		ehca_err(&shca->ib_device, "Modify port failed  h_ret=%li",
 			 hret);
 		ret = -EINVAL;
 	}
diff --git a/drivers/infiniband/hw/ehca/ehca_irq.c b/drivers/infiniband/hw/ehca/ehca_irq.c
index a925ea52443f..3f617b27b954 100644
--- a/drivers/infiniband/hw/ehca/ehca_irq.c
+++ b/drivers/infiniband/hw/ehca/ehca_irq.c
@@ -69,9 +69,6 @@
 static void queue_comp_task(struct ehca_cq *__cq);
 
 static struct ehca_comp_pool *pool;
-#ifdef CONFIG_HOTPLUG_CPU
-static struct notifier_block comp_pool_callback_nb;
-#endif
 
 static inline void comp_event_callback(struct ehca_cq *cq)
 {
@@ -294,8 +291,8 @@ static void parse_identifier(struct ehca_shca *shca, u64 eqe)
 	case 0x11: /* unaffiliated access error */
 		ehca_err(&shca->ib_device, "Unaffiliated access error.");
 		break;
-	case 0x12: /* path migrating error */
-		ehca_err(&shca->ib_device, "Path migration error.");
+	case 0x12: /* path migrating */
+		ehca_err(&shca->ib_device, "Path migrating.");
 		break;
 	case 0x13: /* interface trace stopped */
 		ehca_err(&shca->ib_device, "Interface trace stopped.");
@@ -760,9 +757,7 @@ static void destroy_comp_task(struct ehca_comp_pool *pool,
 		kthread_stop(task);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-static void take_over_work(struct ehca_comp_pool *pool,
-			   int cpu)
+static void __cpuinit take_over_work(struct ehca_comp_pool *pool, int cpu)
 {
 	struct ehca_cpu_comp_task *cct = per_cpu_ptr(pool->cpu_comp_tasks, cpu);
 	LIST_HEAD(list);
@@ -785,9 +780,9 @@ static void take_over_work(struct ehca_comp_pool *pool,
 
 }
 
-static int comp_pool_callback(struct notifier_block *nfb,
-			      unsigned long action,
-			      void *hcpu)
+static int __cpuinit comp_pool_callback(struct notifier_block *nfb,
+					unsigned long action,
+					void *hcpu)
 {
 	unsigned int cpu = (unsigned long)hcpu;
 	struct ehca_cpu_comp_task *cct;
@@ -833,7 +828,11 @@ static int comp_pool_callback(struct notifier_block *nfb,
 
 	return NOTIFY_OK;
 }
-#endif
+
+static struct notifier_block comp_pool_callback_nb __cpuinitdata = {
+	.notifier_call	= comp_pool_callback,
+	.priority	= 0,
+};
 
 int ehca_create_comp_pool(void)
 {
@@ -864,11 +863,7 @@ int ehca_create_comp_pool(void)
 		}
 	}
 
-#ifdef CONFIG_HOTPLUG_CPU
-	comp_pool_callback_nb.notifier_call = comp_pool_callback;
-	comp_pool_callback_nb.priority = 0;
-	register_cpu_notifier(&comp_pool_callback_nb);
-#endif
+	register_hotcpu_notifier(&comp_pool_callback_nb);
 
 	printk(KERN_INFO "eHCA scaling code enabled\n");
 
@@ -882,9 +877,7 @@ void ehca_destroy_comp_pool(void)
 	if (!ehca_scaling_code)
 		return;
 
-#ifdef CONFIG_HOTPLUG_CPU
-	unregister_cpu_notifier(&comp_pool_callback_nb);
-#endif
+	unregister_hotcpu_notifier(&comp_pool_callback_nb);
 
 	for (i = 0; i < NR_CPUS; i++) {
 		if (cpu_online(i))
diff --git a/drivers/infiniband/hw/ehca/ehca_main.c b/drivers/infiniband/hw/ehca/ehca_main.c
index 99036b65bb84..403467f66fe6 100644
--- a/drivers/infiniband/hw/ehca/ehca_main.c
+++ b/drivers/infiniband/hw/ehca/ehca_main.c
@@ -49,10 +49,12 @@
 #include "ehca_tools.h"
 #include "hcp_if.h"
 
+#define HCAD_VERSION "0024"
+
 MODULE_LICENSE("Dual BSD/GPL");
 MODULE_AUTHOR("Christoph Raisch <raisch@de.ibm.com>");
 MODULE_DESCRIPTION("IBM eServer HCA InfiniBand Device Driver");
-MODULE_VERSION("SVNEHCA_0023");
+MODULE_VERSION(HCAD_VERSION);
 
 int ehca_open_aqp1     = 0;
 int ehca_debug_level   = 0;
@@ -65,16 +67,16 @@ int ehca_static_rate   = -1;
 int ehca_scaling_code  = 0;
 int ehca_mr_largepage  = 0;
 
-module_param_named(open_aqp1,     ehca_open_aqp1,     int, 0);
-module_param_named(debug_level,   ehca_debug_level,   int, 0);
-module_param_named(hw_level,      ehca_hw_level,      int, 0);
-module_param_named(nr_ports,      ehca_nr_ports,      int, 0);
-module_param_named(use_hp_mr,     ehca_use_hp_mr,     int, 0);
-module_param_named(port_act_time, ehca_port_act_time, int, 0);
-module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  int, 0);
-module_param_named(static_rate,   ehca_static_rate,   int, 0);
-module_param_named(scaling_code,  ehca_scaling_code,  int, 0);
-module_param_named(mr_largepage,  ehca_mr_largepage,  int, 0);
+module_param_named(open_aqp1,     ehca_open_aqp1,     int, S_IRUGO);
+module_param_named(debug_level,   ehca_debug_level,   int, S_IRUGO);
+module_param_named(hw_level,      ehca_hw_level,      int, S_IRUGO);
+module_param_named(nr_ports,      ehca_nr_ports,      int, S_IRUGO);
+module_param_named(use_hp_mr,     ehca_use_hp_mr,     int, S_IRUGO);
+module_param_named(port_act_time, ehca_port_act_time, int, S_IRUGO);
+module_param_named(poll_all_eqs,  ehca_poll_all_eqs,  int, S_IRUGO);
+module_param_named(static_rate,   ehca_static_rate,   int, S_IRUGO);
+module_param_named(scaling_code,  ehca_scaling_code,  int, S_IRUGO);
+module_param_named(mr_largepage,  ehca_mr_largepage,  int, S_IRUGO);
 
 MODULE_PARM_DESC(open_aqp1,
 		 "AQP1 on startup (0: no (default), 1: yes)");
@@ -273,7 +275,7 @@ int ehca_sense_attributes(struct ehca_shca *shca)
 
 	h_ret = hipz_h_query_hca(shca->ipz_hca_handle, rblock);
 	if (h_ret != H_SUCCESS) {
-		ehca_gen_err("Cannot query device properties. h_ret=%lx",
+		ehca_gen_err("Cannot query device properties. h_ret=%li",
 			     h_ret);
 		ret = -EPERM;
 		goto sense_attributes1;
@@ -332,7 +334,7 @@ int ehca_sense_attributes(struct ehca_shca *shca)
 	port = (struct hipz_query_port *)rblock;
 	h_ret = hipz_h_query_port(shca->ipz_hca_handle, 1, port);
 	if (h_ret != H_SUCCESS) {
-		ehca_gen_err("Cannot query port properties. h_ret=%lx",
+		ehca_gen_err("Cannot query port properties. h_ret=%li",
 			     h_ret);
 		ret = -EPERM;
 		goto sense_attributes1;
@@ -380,7 +382,7 @@ int ehca_init_device(struct ehca_shca *shca)
 	strlcpy(shca->ib_device.name, "ehca%d", IB_DEVICE_NAME_MAX);
 	shca->ib_device.owner               = THIS_MODULE;
 
-	shca->ib_device.uverbs_abi_ver	    = 7;
+	shca->ib_device.uverbs_abi_ver	    = 8;
 	shca->ib_device.uverbs_cmd_mask	    =
 		(1ull << IB_USER_VERBS_CMD_GET_CONTEXT)		|
 		(1ull << IB_USER_VERBS_CMD_QUERY_DEVICE)	|
@@ -526,13 +528,13 @@ static int ehca_destroy_aqp1(struct ehca_sport *sport)
 
 	ret = ib_destroy_qp(sport->ibqp_aqp1);
 	if (ret) {
-		ehca_gen_err("Cannot destroy AQP1 QP. ret=%x", ret);
+		ehca_gen_err("Cannot destroy AQP1 QP. ret=%i", ret);
 		return ret;
 	}
 
 	ret = ib_destroy_cq(sport->ibcq_aqp1);
 	if (ret)
-		ehca_gen_err("Cannot destroy AQP1 CQ. ret=%x", ret);
+		ehca_gen_err("Cannot destroy AQP1 CQ. ret=%i", ret);
 
 	return ret;
 }
@@ -728,7 +730,7 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev,
 	ret = ehca_reg_internal_maxmr(shca, shca->pd, &shca->maxmr);
 
 	if (ret) {
-		ehca_err(&shca->ib_device, "Cannot create internal MR ret=%x",
+		ehca_err(&shca->ib_device, "Cannot create internal MR ret=%i",
 			 ret);
 		goto probe5;
 	}
@@ -736,7 +738,7 @@ static int __devinit ehca_probe(struct ibmebus_dev *dev,
 	ret = ib_register_device(&shca->ib_device);
 	if (ret) {
 		ehca_err(&shca->ib_device,
-			 "ib_register_device() failed ret=%x", ret);
+			 "ib_register_device() failed ret=%i", ret);
 		goto probe6;
 	}
 
@@ -777,7 +779,7 @@ probe8:
 	ret = ehca_destroy_aqp1(&shca->sport[0]);
 	if (ret)
 		ehca_err(&shca->ib_device,
-			 "Cannot destroy AQP1 for port 1. ret=%x", ret);
+			 "Cannot destroy AQP1 for port 1. ret=%i", ret);
 
 probe7:
 	ib_unregister_device(&shca->ib_device);
@@ -826,7 +828,7 @@ static int __devexit ehca_remove(struct ibmebus_dev *dev)
 			if (ret)
 				ehca_err(&shca->ib_device,
 					 "Cannot destroy AQP1 for port %x "
-					 "ret=%x", ret, i);
+					 "ret=%i", ret, i);
 		}
 	}
 
@@ -835,20 +837,20 @@ static int __devexit ehca_remove(struct ibmebus_dev *dev)
 	ret = ehca_dereg_internal_maxmr(shca);
 	if (ret)
 		ehca_err(&shca->ib_device,
-			 "Cannot destroy internal MR. ret=%x", ret);
+			 "Cannot destroy internal MR. ret=%i", ret);
 
 	ret = ehca_dealloc_pd(&shca->pd->ib_pd);
 	if (ret)
 		ehca_err(&shca->ib_device,
-			 "Cannot destroy internal PD. ret=%x", ret);
+			 "Cannot destroy internal PD. ret=%i", ret);
 
 	ret = ehca_destroy_eq(shca, &shca->eq);
 	if (ret)
-		ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%x", ret);
+		ehca_err(&shca->ib_device, "Cannot destroy EQ. ret=%i", ret);
 
 	ret = ehca_destroy_eq(shca, &shca->neq);
 	if (ret)
-		ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%x", ret);
+		ehca_err(&shca->ib_device, "Canot destroy NEQ. ret=%i", ret);
 
 	ib_dealloc_device(&shca->ib_device);
 
@@ -909,7 +911,7 @@ int __init ehca_module_init(void)
 	int ret;
 
 	printk(KERN_INFO "eHCA Infiniband Device Driver "
-	       "(Rel.: SVNEHCA_0023)\n");
+	       "(Version " HCAD_VERSION ")\n");
 
 	ret = ehca_create_comp_pool();
 	if (ret) {
diff --git a/drivers/infiniband/hw/ehca/ehca_mcast.c b/drivers/infiniband/hw/ehca/ehca_mcast.c
index 32a870660bfe..e3ef0264ccc6 100644
--- a/drivers/infiniband/hw/ehca/ehca_mcast.c
+++ b/drivers/infiniband/hw/ehca/ehca_mcast.c
@@ -88,7 +88,7 @@ int ehca_attach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 	if (h_ret != H_SUCCESS)
 		ehca_err(ibqp->device,
 			 "ehca_qp=%p qp_num=%x hipz_h_attach_mcqp() failed "
-			 "h_ret=%lx", my_qp, ibqp->qp_num, h_ret);
+			 "h_ret=%li", my_qp, ibqp->qp_num, h_ret);
 
 	return ehca2ib_return_code(h_ret);
 }
@@ -125,7 +125,7 @@ int ehca_detach_mcast(struct ib_qp *ibqp, union ib_gid *gid, u16 lid)
 	if (h_ret != H_SUCCESS)
 		ehca_err(ibqp->device,
 			 "ehca_qp=%p qp_num=%x hipz_h_detach_mcqp() failed "
-			 "h_ret=%lx", my_qp, ibqp->qp_num, h_ret);
+			 "h_ret=%li", my_qp, ibqp->qp_num, h_ret);
 
 	return ehca2ib_return_code(h_ret);
 }
diff --git a/drivers/infiniband/hw/ehca/ehca_mrmw.c b/drivers/infiniband/hw/ehca/ehca_mrmw.c
index d97eda3e1da0..da88738265ed 100644
--- a/drivers/infiniband/hw/ehca/ehca_mrmw.c
+++ b/drivers/infiniband/hw/ehca/ehca_mrmw.c
@@ -51,6 +51,7 @@
 
 #define NUM_CHUNKS(length, chunk_size) \
 	(((length) + (chunk_size - 1)) / (chunk_size))
+
 /* max number of rpages (per hcall register_rpages) */
 #define MAX_RPAGES 512
 
@@ -64,6 +65,11 @@ enum ehca_mr_pgsize {
 	EHCA_MR_PGSIZE16M = 0x1000000L
 };
 
+#define EHCA_MR_PGSHIFT4K  12
+#define EHCA_MR_PGSHIFT64K 16
+#define EHCA_MR_PGSHIFT1M  20
+#define EHCA_MR_PGSHIFT16M 24
+
 static u32 ehca_encode_hwpage_size(u32 pgsize)
 {
 	u32 idx = 0;
@@ -159,7 +165,7 @@ struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags)
 
 get_dma_mr_exit0:
 	if (IS_ERR(ib_mr))
-		ehca_err(&shca->ib_device, "rc=%lx pd=%p mr_access_flags=%x ",
+		ehca_err(&shca->ib_device, "h_ret=%li pd=%p mr_access_flags=%x",
 			 PTR_ERR(ib_mr), pd, mr_access_flags);
 	return ib_mr;
 } /* end ehca_get_dma_mr() */
@@ -271,7 +277,7 @@ reg_phys_mr_exit1:
 	ehca_mr_delete(e_mr);
 reg_phys_mr_exit0:
 	if (IS_ERR(ib_mr))
-		ehca_err(pd->device, "rc=%lx pd=%p phys_buf_array=%p "
+		ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p "
 			 "num_phys_buf=%x mr_access_flags=%x iova_start=%p",
 			 PTR_ERR(ib_mr), pd, phys_buf_array,
 			 num_phys_buf, mr_access_flags, iova_start);
@@ -347,17 +353,16 @@ struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length,
 	/* select proper hw_pgsize */
 	if (ehca_mr_largepage &&
 	    (shca->hca_cap_mr_pgsize & HCA_CAP_MR_PGSIZE_16M)) {
-		if (length <= EHCA_MR_PGSIZE4K
-		    && PAGE_SIZE == EHCA_MR_PGSIZE4K)
-			hwpage_size = EHCA_MR_PGSIZE4K;
-		else if (length <= EHCA_MR_PGSIZE64K)
-			hwpage_size = EHCA_MR_PGSIZE64K;
-		else if (length <= EHCA_MR_PGSIZE1M)
-			hwpage_size = EHCA_MR_PGSIZE1M;
-		else
-			hwpage_size = EHCA_MR_PGSIZE16M;
+		int page_shift = PAGE_SHIFT;
+		if (e_mr->umem->hugetlb) {
+			/* determine page_shift, clamp between 4K and 16M */
+			page_shift = (fls64(length - 1) + 3) & ~3;
+			page_shift = min(max(page_shift, EHCA_MR_PGSHIFT4K),
+					 EHCA_MR_PGSHIFT16M);
+		}
+		hwpage_size = 1UL << page_shift;
 	} else
-		hwpage_size = EHCA_MR_PGSIZE4K;
+		hwpage_size = EHCA_MR_PGSIZE4K; /* ehca1 only supports 4k */
 	ehca_dbg(pd->device, "hwpage_size=%lx", hwpage_size);
 
 reg_user_mr_fallback:
@@ -403,8 +408,7 @@ reg_user_mr_exit1:
 	ehca_mr_delete(e_mr);
 reg_user_mr_exit0:
 	if (IS_ERR(ib_mr))
-		ehca_err(pd->device, "rc=%lx pd=%p mr_access_flags=%x"
-			 " udata=%p",
+		ehca_err(pd->device, "rc=%li pd=%p mr_access_flags=%x udata=%p",
 			 PTR_ERR(ib_mr), pd, mr_access_flags, udata);
 	return ib_mr;
 } /* end ehca_reg_user_mr() */
@@ -565,7 +569,7 @@ rereg_phys_mr_exit1:
 	spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
 rereg_phys_mr_exit0:
 	if (ret)
-		ehca_err(mr->device, "ret=%x mr=%p mr_rereg_mask=%x pd=%p "
+		ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p "
 			 "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x "
 			 "iova_start=%p",
 			 ret, mr, mr_rereg_mask, pd, phys_buf_array,
@@ -607,7 +611,7 @@ int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr)
 
 	h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lx mr=%p "
+		ehca_err(mr->device, "hipz_mr_query failed, h_ret=%li mr=%p "
 			 "hca_hndl=%lx mr_hndl=%lx lkey=%x",
 			 h_ret, mr, shca->ipz_hca_handle.handle,
 			 e_mr->ipz_mr_handle.handle, mr->lkey);
@@ -625,7 +629,7 @@ query_mr_exit1:
 	spin_unlock_irqrestore(&e_mr->mrlock, sl_flags);
 query_mr_exit0:
 	if (ret)
-		ehca_err(mr->device, "ret=%x mr=%p mr_attr=%p",
+		ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p",
 			 ret, mr, mr_attr);
 	return ret;
 } /* end ehca_query_mr() */
@@ -667,7 +671,7 @@ int ehca_dereg_mr(struct ib_mr *mr)
 	/* TODO: BUSY: MR still has bound window(s) */
 	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(mr->device, "hipz_free_mr failed, h_ret=%lx shca=%p "
+		ehca_err(mr->device, "hipz_free_mr failed, h_ret=%li shca=%p "
 			 "e_mr=%p hca_hndl=%lx mr_hndl=%lx mr->lkey=%x",
 			 h_ret, shca, e_mr, shca->ipz_hca_handle.handle,
 			 e_mr->ipz_mr_handle.handle, mr->lkey);
@@ -683,7 +687,7 @@ int ehca_dereg_mr(struct ib_mr *mr)
 
 dereg_mr_exit0:
 	if (ret)
-		ehca_err(mr->device, "ret=%x mr=%p", ret, mr);
+		ehca_err(mr->device, "ret=%i mr=%p", ret, mr);
 	return ret;
 } /* end ehca_dereg_mr() */
 
@@ -708,7 +712,7 @@ struct ib_mw *ehca_alloc_mw(struct ib_pd *pd)
 	h_ret = hipz_h_alloc_resource_mw(shca->ipz_hca_handle, e_mw,
 					 e_pd->fw_pd, &hipzout);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%lx "
+		ehca_err(pd->device, "hipz_mw_allocate failed, h_ret=%li "
 			 "shca=%p hca_hndl=%lx mw=%p",
 			 h_ret, shca, shca->ipz_hca_handle.handle, e_mw);
 		ib_mw = ERR_PTR(ehca2ib_return_code(h_ret));
@@ -723,7 +727,7 @@ alloc_mw_exit1:
 	ehca_mw_delete(e_mw);
 alloc_mw_exit0:
 	if (IS_ERR(ib_mw))
-		ehca_err(pd->device, "rc=%lx pd=%p", PTR_ERR(ib_mw), pd);
+		ehca_err(pd->device, "h_ret=%li pd=%p", PTR_ERR(ib_mw), pd);
 	return ib_mw;
 } /* end ehca_alloc_mw() */
 
@@ -750,7 +754,7 @@ int ehca_dealloc_mw(struct ib_mw *mw)
 
 	h_ret = hipz_h_free_resource_mw(shca->ipz_hca_handle, e_mw);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(mw->device, "hipz_free_mw failed, h_ret=%lx shca=%p "
+		ehca_err(mw->device, "hipz_free_mw failed, h_ret=%li shca=%p "
 			 "mw=%p rkey=%x hca_hndl=%lx mw_hndl=%lx",
 			 h_ret, shca, mw, mw->rkey, shca->ipz_hca_handle.handle,
 			 e_mw->ipz_mw_handle.handle);
@@ -846,10 +850,6 @@ struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd,
 alloc_fmr_exit1:
 	ehca_mr_delete(e_fmr);
 alloc_fmr_exit0:
-	if (IS_ERR(ib_fmr))
-		ehca_err(pd->device, "rc=%lx pd=%p mr_access_flags=%x "
-			 "fmr_attr=%p", PTR_ERR(ib_fmr), pd,
-			 mr_access_flags, fmr_attr);
 	return ib_fmr;
 } /* end ehca_alloc_fmr() */
 
@@ -916,7 +916,7 @@ int ehca_map_phys_fmr(struct ib_fmr *fmr,
 
 map_phys_fmr_exit0:
 	if (ret)
-		ehca_err(fmr->device, "ret=%x fmr=%p page_list=%p list_len=%x "
+		ehca_err(fmr->device, "ret=%i fmr=%p page_list=%p list_len=%x "
 			 "iova=%lx", ret, fmr, page_list, list_len, iova);
 	return ret;
 } /* end ehca_map_phys_fmr() */
@@ -979,7 +979,7 @@ int ehca_unmap_fmr(struct list_head *fmr_list)
 
 unmap_fmr_exit0:
 	if (ret)
-		ehca_gen_err("ret=%x fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
+		ehca_gen_err("ret=%i fmr_list=%p num_fmr=%x unmap_fmr_cnt=%x",
 			     ret, fmr_list, num_fmr, unmap_fmr_cnt);
 	return ret;
 } /* end ehca_unmap_fmr() */
@@ -1003,7 +1003,7 @@ int ehca_dealloc_fmr(struct ib_fmr *fmr)
 
 	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%lx e_fmr=%p "
+		ehca_err(fmr->device, "hipz_free_mr failed, h_ret=%li e_fmr=%p "
 			 "hca_hndl=%lx fmr_hndl=%lx fmr->lkey=%x",
 			 h_ret, e_fmr, shca->ipz_hca_handle.handle,
 			 e_fmr->ipz_mr_handle.handle, fmr->lkey);
@@ -1016,7 +1016,7 @@ int ehca_dealloc_fmr(struct ib_fmr *fmr)
 
 free_fmr_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x fmr=%p", ret, fmr);
+		ehca_err(&shca->ib_device, "ret=%i fmr=%p", ret, fmr);
 	return ret;
 } /* end ehca_dealloc_fmr() */
 
@@ -1046,7 +1046,7 @@ int ehca_reg_mr(struct ehca_shca *shca,
 					 (u64)iova_start, size, hipz_acl,
 					 e_pd->fw_pd, &hipzout);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%lx "
+		ehca_err(&shca->ib_device, "hipz_alloc_mr failed, h_ret=%li "
 			 "hca_hndl=%lx", h_ret, shca->ipz_hca_handle.handle);
 		ret = ehca2ib_return_code(h_ret);
 		goto ehca_reg_mr_exit0;
@@ -1072,9 +1072,9 @@ int ehca_reg_mr(struct ehca_shca *shca,
 ehca_reg_mr_exit1:
 	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(&shca->ib_device, "h_ret=%lx shca=%p e_mr=%p "
+		ehca_err(&shca->ib_device, "h_ret=%li shca=%p e_mr=%p "
 			 "iova_start=%p size=%lx acl=%x e_pd=%p lkey=%x "
-			 "pginfo=%p num_kpages=%lx num_hwpages=%lx ret=%x",
+			 "pginfo=%p num_kpages=%lx num_hwpages=%lx ret=%i",
 			 h_ret, shca, e_mr, iova_start, size, acl, e_pd,
 			 hipzout.lkey, pginfo, pginfo->num_kpages,
 			 pginfo->num_hwpages, ret);
@@ -1083,7 +1083,7 @@ ehca_reg_mr_exit1:
 	}
 ehca_reg_mr_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x shca=%p e_mr=%p "
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
 			 "iova_start=%p size=%lx acl=%x e_pd=%p pginfo=%p "
 			 "num_kpages=%lx num_hwpages=%lx",
 			 ret, shca, e_mr, iova_start, size, acl, e_pd, pginfo,
@@ -1127,7 +1127,7 @@ int ehca_reg_mr_rpages(struct ehca_shca *shca,
 		ret = ehca_set_pagebuf(pginfo, rnum, kpage);
 		if (ret) {
 			ehca_err(&shca->ib_device, "ehca_set_pagebuf "
-				 "bad rc, ret=%x rnum=%x kpage=%p",
+				 "bad rc, ret=%i rnum=%x kpage=%p",
 				 ret, rnum, kpage);
 			goto ehca_reg_mr_rpages_exit1;
 		}
@@ -1155,7 +1155,7 @@ int ehca_reg_mr_rpages(struct ehca_shca *shca,
 			 */
 			if (h_ret != H_SUCCESS) {
 				ehca_err(&shca->ib_device, "last "
-					 "hipz_reg_rpage_mr failed, h_ret=%lx "
+					 "hipz_reg_rpage_mr failed, h_ret=%li "
 					 "e_mr=%p i=%x hca_hndl=%lx mr_hndl=%lx"
 					 " lkey=%x", h_ret, e_mr, i,
 					 shca->ipz_hca_handle.handle,
@@ -1167,7 +1167,7 @@ int ehca_reg_mr_rpages(struct ehca_shca *shca,
 				ret = 0;
 		} else if (h_ret != H_PAGE_REGISTERED) {
 			ehca_err(&shca->ib_device, "hipz_reg_rpage_mr failed, "
-				 "h_ret=%lx e_mr=%p i=%x lkey=%x hca_hndl=%lx "
+				 "h_ret=%li e_mr=%p i=%x lkey=%x hca_hndl=%lx "
 				 "mr_hndl=%lx", h_ret, e_mr, i,
 				 e_mr->ib.ib_mr.lkey,
 				 shca->ipz_hca_handle.handle,
@@ -1183,7 +1183,7 @@ ehca_reg_mr_rpages_exit1:
 	ehca_free_fw_ctrlblock(kpage);
 ehca_reg_mr_rpages_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x shca=%p e_mr=%p pginfo=%p "
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p pginfo=%p "
 			 "num_kpages=%lx num_hwpages=%lx", ret, shca, e_mr,
 			 pginfo, pginfo->num_kpages, pginfo->num_hwpages);
 	return ret;
@@ -1244,7 +1244,7 @@ inline int ehca_rereg_mr_rereg1(struct ehca_shca *shca,
 		 * (MW bound or MR is shared)
 		 */
 		ehca_warn(&shca->ib_device, "hipz_h_reregister_pmr failed "
-			  "(Rereg1), h_ret=%lx e_mr=%p", h_ret, e_mr);
+			  "(Rereg1), h_ret=%li e_mr=%p", h_ret, e_mr);
 		*pginfo = pginfo_save;
 		ret = -EAGAIN;
 	} else if ((u64 *)hipzout.vaddr != iova_start) {
@@ -1273,7 +1273,7 @@ ehca_rereg_mr_rereg1_exit1:
 	ehca_free_fw_ctrlblock(kpage);
 ehca_rereg_mr_rereg1_exit0:
 	if ( ret && (ret != -EAGAIN) )
-		ehca_err(&shca->ib_device, "ret=%x lkey=%x rkey=%x "
+		ehca_err(&shca->ib_device, "ret=%i lkey=%x rkey=%x "
 			 "pginfo=%p num_kpages=%lx num_hwpages=%lx",
 			 ret, *lkey, *rkey, pginfo, pginfo->num_kpages,
 			 pginfo->num_hwpages);
@@ -1334,7 +1334,7 @@ int ehca_rereg_mr(struct ehca_shca *shca,
 		h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_mr);
 		if (h_ret != H_SUCCESS) {
 			ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-				 "h_ret=%lx e_mr=%p hca_hndl=%lx mr_hndl=%lx "
+				 "h_ret=%li e_mr=%p hca_hndl=%lx mr_hndl=%lx "
 				 "mr->lkey=%x",
 				 h_ret, e_mr, shca->ipz_hca_handle.handle,
 				 e_mr->ipz_mr_handle.handle,
@@ -1366,7 +1366,7 @@ int ehca_rereg_mr(struct ehca_shca *shca,
 
 ehca_rereg_mr_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x shca=%p e_mr=%p "
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_mr=%p "
 			 "iova_start=%p size=%lx acl=%x e_pd=%p pginfo=%p "
 			 "num_kpages=%lx lkey=%x rkey=%x rereg_1_hcall=%x "
 			 "rereg_3_hcall=%x", ret, shca, e_mr, iova_start, size,
@@ -1410,7 +1410,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca,
 		 * FMRs are not shared and no MW bound to FMRs
 		 */
 		ehca_err(&shca->ib_device, "hipz_reregister_pmr failed "
-			 "(Rereg1), h_ret=%lx e_fmr=%p hca_hndl=%lx "
+			 "(Rereg1), h_ret=%li e_fmr=%p hca_hndl=%lx "
 			 "mr_hndl=%lx lkey=%x lkey_out=%x",
 			 h_ret, e_fmr, shca->ipz_hca_handle.handle,
 			 e_fmr->ipz_mr_handle.handle,
@@ -1422,7 +1422,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca,
 	h_ret = hipz_h_free_resource_mr(shca->ipz_hca_handle, e_fmr);
 	if (h_ret != H_SUCCESS) {
 		ehca_err(&shca->ib_device, "hipz_free_mr failed, "
-			 "h_ret=%lx e_fmr=%p hca_hndl=%lx mr_hndl=%lx "
+			 "h_ret=%li e_fmr=%p hca_hndl=%lx mr_hndl=%lx "
 			 "lkey=%x",
 			 h_ret, e_fmr, shca->ipz_hca_handle.handle,
 			 e_fmr->ipz_mr_handle.handle,
@@ -1457,7 +1457,7 @@ int ehca_unmap_one_fmr(struct ehca_shca *shca,
 
 ehca_unmap_one_fmr_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x tmp_lkey=%x tmp_rkey=%x "
+		ehca_err(&shca->ib_device, "ret=%i tmp_lkey=%x tmp_rkey=%x "
 			 "fmr_max_pages=%x",
 			 ret, tmp_lkey, tmp_rkey, e_fmr->fmr_max_pages);
 	return ret;
@@ -1486,7 +1486,7 @@ int ehca_reg_smr(struct ehca_shca *shca,
 				    (u64)iova_start, hipz_acl, e_pd->fw_pd,
 				    &hipzout);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lx "
+		ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%li "
 			 "shca=%p e_origmr=%p e_newmr=%p iova_start=%p acl=%x "
 			 "e_pd=%p hca_hndl=%lx mr_hndl=%lx lkey=%x",
 			 h_ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd,
@@ -1510,7 +1510,7 @@ int ehca_reg_smr(struct ehca_shca *shca,
 
 ehca_reg_smr_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x shca=%p e_origmr=%p "
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_origmr=%p "
 			 "e_newmr=%p iova_start=%p acl=%x e_pd=%p",
 			 ret, shca, e_origmr, e_newmr, iova_start, acl, e_pd);
 	return ret;
@@ -1585,7 +1585,7 @@ ehca_reg_internal_maxmr_exit1:
 	ehca_mr_delete(e_mr);
 ehca_reg_internal_maxmr_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x shca=%p e_pd=%p e_maxmr=%p",
+		ehca_err(&shca->ib_device, "ret=%i shca=%p e_pd=%p e_maxmr=%p",
 			 ret, shca, e_pd, e_maxmr);
 	return ret;
 } /* end ehca_reg_internal_maxmr() */
@@ -1612,7 +1612,7 @@ int ehca_reg_maxmr(struct ehca_shca *shca,
 				    (u64)iova_start, hipz_acl, e_pd->fw_pd,
 				    &hipzout);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%lx "
+		ehca_err(&shca->ib_device, "hipz_reg_smr failed, h_ret=%li "
 			 "e_origmr=%p hca_hndl=%lx mr_hndl=%lx lkey=%x",
 			 h_ret, e_origmr, shca->ipz_hca_handle.handle,
 			 e_origmr->ipz_mr_handle.handle,
@@ -1653,7 +1653,7 @@ int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
 	ret = ehca_dereg_mr(&e_maxmr->ib.ib_mr);
 	if (ret) {
 		ehca_err(&shca->ib_device, "dereg internal max-MR failed, "
-			 "ret=%x e_maxmr=%p shca=%p lkey=%x",
+			 "ret=%i e_maxmr=%p shca=%p lkey=%x",
 			 ret, e_maxmr, shca, e_maxmr->ib.ib_mr.lkey);
 		shca->maxmr = e_maxmr;
 		goto ehca_dereg_internal_maxmr_exit0;
@@ -1663,7 +1663,7 @@ int ehca_dereg_internal_maxmr(struct ehca_shca *shca)
 
 ehca_dereg_internal_maxmr_exit0:
 	if (ret)
-		ehca_err(&shca->ib_device, "ret=%x shca=%p shca->maxmr=%p",
+		ehca_err(&shca->ib_device, "ret=%i shca=%p shca->maxmr=%p",
 			 ret, shca, shca->maxmr);
 	return ret;
 } /* end ehca_dereg_internal_maxmr() */
diff --git a/drivers/infiniband/hw/ehca/ehca_qp.c b/drivers/infiniband/hw/ehca/ehca_qp.c
index 84d435a5ee11..e2bd62be11e7 100644
--- a/drivers/infiniband/hw/ehca/ehca_qp.c
+++ b/drivers/infiniband/hw/ehca/ehca_qp.c
@@ -273,6 +273,7 @@ static inline void queue2resp(struct ipzu_queue_resp *resp,
 	resp->queue_length = queue->queue_length;
 	resp->pagesize = queue->pagesize;
 	resp->toggle_state = queue->toggle_state;
+	resp->offset = queue->offset;
 }
 
 /*
@@ -309,7 +310,7 @@ static inline int init_qp_queue(struct ehca_shca *shca,
 	}
 
 	if (!ipz_rc) {
-		ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%x",
+		ehca_err(ib_dev, "Cannot allocate page for queue. ipz_rc=%i",
 			 ipz_rc);
 		return -EBUSY;
 	}
@@ -333,7 +334,7 @@ static inline int init_qp_queue(struct ehca_shca *shca,
 		if (cnt == (nr_q_pages - 1)) {	/* last page! */
 			if (h_ret != expected_hret) {
 				ehca_err(ib_dev, "hipz_qp_register_rpage() "
-					 "h_ret= %lx ", h_ret);
+					 "h_ret=%li", h_ret);
 				ret = ehca2ib_return_code(h_ret);
 				goto init_qp_queue1;
 			}
@@ -347,7 +348,7 @@ static inline int init_qp_queue(struct ehca_shca *shca,
 		} else {
 			if (h_ret != H_PAGE_REGISTERED) {
 				ehca_err(ib_dev, "hipz_qp_register_rpage() "
-					 "h_ret= %lx ", h_ret);
+					 "h_ret=%li", h_ret);
 				ret = ehca2ib_return_code(h_ret);
 				goto init_qp_queue1;
 			}
@@ -512,7 +513,7 @@ static struct ehca_qp *internal_create_qp(
 			} else if (init_attr->cap.max_send_wr > 255) {
 				ehca_err(pd->device,
 					 "Invalid Number of "
-					 "ax_send_wr=%x for UD QP_TYPE=%x",
+					 "max_send_wr=%x for UD QP_TYPE=%x",
 					 init_attr->cap.max_send_wr, qp_type);
 				return ERR_PTR(-EINVAL);
 			}
@@ -523,6 +524,18 @@ static struct ehca_qp *internal_create_qp(
 			return ERR_PTR(-EINVAL);
 			break;
 		}
+	} else {
+		int max_sge = (qp_type == IB_QPT_UD || qp_type == IB_QPT_SMI
+			       || qp_type == IB_QPT_GSI) ? 250 : 252;
+
+		if (init_attr->cap.max_send_sge > max_sge
+		    || init_attr->cap.max_recv_sge > max_sge) {
+			ehca_err(pd->device, "Invalid number of SGEs requested "
+				 "send_sge=%x recv_sge=%x max_sge=%x",
+				 init_attr->cap.max_send_sge,
+				 init_attr->cap.max_recv_sge, max_sge);
+			return ERR_PTR(-EINVAL);
+		}
 	}
 
 	if (pd->uobject && udata)
@@ -556,7 +569,6 @@ static struct ehca_qp *internal_create_qp(
 		write_lock_irqsave(&ehca_qp_idr_lock, flags);
 		ret = idr_get_new(&ehca_qp_idr, my_qp, &my_qp->token);
 		write_unlock_irqrestore(&ehca_qp_idr_lock, flags);
-
 	} while (ret == -EAGAIN);
 
 	if (ret) {
@@ -565,11 +577,17 @@ static struct ehca_qp *internal_create_qp(
 		goto create_qp_exit0;
 	}
 
+	if (my_qp->token > 0x1FFFFFF) {
+		ret = -EINVAL;
+		ehca_err(pd->device, "Invalid number of qp");
+		goto create_qp_exit1;
+	}
+
 	parms.servicetype = ibqptype2servicetype(qp_type);
 	if (parms.servicetype < 0) {
 		ret = -EINVAL;
 		ehca_err(pd->device, "Invalid qp_type=%x", qp_type);
-		goto create_qp_exit0;
+		goto create_qp_exit1;
 	}
 
 	if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR)
@@ -598,8 +616,7 @@ static struct ehca_qp *internal_create_qp(
 	parms.squeue.max_sge = max_send_sge;
 	parms.rqueue.max_sge = max_recv_sge;
 
-	if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)
-	    && !(context && udata)) { /* no small QP support in userspace ATM */
+	if (EHCA_BMASK_GET(HCA_CAP_MINI_QP, shca->hca_cap)) {
 		if (HAS_SQ(my_qp))
 			ehca_determine_small_queue(
 				&parms.squeue, max_send_sge, is_llqp);
@@ -612,7 +629,7 @@ static struct ehca_qp *internal_create_qp(
 
 	h_ret = hipz_h_alloc_resource_qp(shca->ipz_hca_handle, &parms);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%lx",
+		ehca_err(pd->device, "h_alloc_resource_qp() failed h_ret=%li",
 			 h_ret);
 		ret = ehca2ib_return_code(h_ret);
 		goto create_qp_exit1;
@@ -666,7 +683,7 @@ static struct ehca_qp *internal_create_qp(
 			&parms.squeue, swqe_size);
 		if (ret) {
 			ehca_err(pd->device, "Couldn't initialize squeue "
-				 "and pages  ret=%x", ret);
+				 "and pages ret=%i", ret);
 			goto create_qp_exit2;
 		}
 	}
@@ -677,7 +694,7 @@ static struct ehca_qp *internal_create_qp(
 			H_SUCCESS, &parms.rqueue, rwqe_size);
 		if (ret) {
 			ehca_err(pd->device, "Couldn't initialize rqueue "
-				 "and pages ret=%x", ret);
+				 "and pages ret=%i", ret);
 			goto create_qp_exit3;
 		}
 	}
@@ -714,8 +731,6 @@ static struct ehca_qp *internal_create_qp(
 	if (qp_type == IB_QPT_GSI) {
 		h_ret = ehca_define_sqp(shca, my_qp, init_attr);
 		if (h_ret != H_SUCCESS) {
-			ehca_err(pd->device, "ehca_define_sqp() failed rc=%lx",
-				 h_ret);
 			ret = ehca2ib_return_code(h_ret);
 			goto create_qp_exit4;
 		}
@@ -725,7 +740,7 @@ static struct ehca_qp *internal_create_qp(
 		ret = ehca_cq_assign_qp(my_qp->send_cq, my_qp);
 		if (ret) {
 			ehca_err(pd->device,
-				 "Couldn't assign qp to send_cq ret=%x", ret);
+				 "Couldn't assign qp to send_cq ret=%i", ret);
 			goto create_qp_exit4;
 		}
 	}
@@ -741,12 +756,13 @@ static struct ehca_qp *internal_create_qp(
 		resp.ext_type = my_qp->ext_type;
 		resp.qkey = my_qp->qkey;
 		resp.real_qp_num = my_qp->real_qp_num;
-		resp.ipz_rqueue.offset = my_qp->ipz_rqueue.offset;
-		resp.ipz_squeue.offset = my_qp->ipz_squeue.offset;
+
 		if (HAS_SQ(my_qp))
 			queue2resp(&resp.ipz_squeue, &my_qp->ipz_squeue);
 		if (HAS_RQ(my_qp))
 			queue2resp(&resp.ipz_rqueue, &my_qp->ipz_rqueue);
+		resp.fw_handle_ofs = (u32)
+			(my_qp->galpas.user.fw_handle & (PAGE_SIZE - 1));
 
 		if (ib_copy_to_udata(udata, &resp, sizeof resp)) {
 			ehca_err(pd->device, "Copy to udata failed");
@@ -841,7 +857,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd,
 				mqpcb, my_qp->galpas.kernel);
 	if (hret != H_SUCCESS) {
 		ehca_err(pd->device, "Could not modify SRQ to INIT"
-			 "ehca_qp=%p qp_num=%x hret=%lx",
+			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, my_qp->real_qp_num, hret);
 		goto create_srq2;
 	}
@@ -855,7 +871,7 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd,
 				mqpcb, my_qp->galpas.kernel);
 	if (hret != H_SUCCESS) {
 		ehca_err(pd->device, "Could not enable SRQ"
-			 "ehca_qp=%p qp_num=%x hret=%lx",
+			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, my_qp->real_qp_num, hret);
 		goto create_srq2;
 	}
@@ -869,11 +885,13 @@ struct ib_srq *ehca_create_srq(struct ib_pd *pd,
 				mqpcb, my_qp->galpas.kernel);
 	if (hret != H_SUCCESS) {
 		ehca_err(pd->device, "Could not modify SRQ to RTR"
-			 "ehca_qp=%p qp_num=%x hret=%lx",
+			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, my_qp->real_qp_num, hret);
 		goto create_srq2;
 	}
 
+	ehca_free_fw_ctrlblock(mqpcb);
+
 	return &my_qp->ib_srq;
 
 create_srq2:
@@ -907,7 +925,7 @@ static int prepare_sqe_rts(struct ehca_qp *my_qp, struct ehca_shca *shca,
 					   &bad_send_wqe_p, NULL, 2);
 	if (h_ret != H_SUCCESS) {
 		ehca_err(&shca->ib_device, "hipz_h_disable_and_get_wqe() failed"
-			 " ehca_qp=%p qp_num=%x h_ret=%lx",
+			 " ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, qp_num, h_ret);
 		return ehca2ib_return_code(h_ret);
 	}
@@ -985,7 +1003,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 				mqpcb, my_qp->galpas.kernel);
 	if (h_ret != H_SUCCESS) {
 		ehca_err(ibqp->device, "hipz_h_query_qp() failed "
-			 "ehca_qp=%p qp_num=%x h_ret=%lx",
+			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, ibqp->qp_num, h_ret);
 		ret = ehca2ib_return_code(h_ret);
 		goto modify_qp_exit1;
@@ -1021,7 +1039,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 			ibqp, &smiqp_attr, smiqp_attr_mask, 1);
 		if (smirc) {
 			ehca_err(ibqp->device, "SMI RESET -> INIT failed. "
-				 "ehca_modify_qp() rc=%x", smirc);
+				 "ehca_modify_qp() rc=%i", smirc);
 			ret = H_PARAMETER;
 			goto modify_qp_exit1;
 		}
@@ -1123,7 +1141,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 		ret = prepare_sqe_rts(my_qp, shca, &bad_wqe_cnt);
 		if (ret) {
 			ehca_err(ibqp->device, "prepare_sqe_rts() failed "
-				 "ehca_qp=%p qp_num=%x ret=%x",
+				 "ehca_qp=%p qp_num=%x ret=%i",
 				 my_qp, ibqp->qp_num, ret);
 			goto modify_qp_exit2;
 		}
@@ -1149,6 +1167,13 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 	}
 
 	if (attr_mask & IB_QP_PKEY_INDEX) {
+		if (attr->pkey_index >= 16) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid pkey_index=%x. "
+				 "ehca_qp=%p qp_num=%x max_pkey_index=f",
+				 attr->pkey_index, my_qp, ibqp->qp_num);
+			goto modify_qp_exit2;
+		}
 		mqpcb->prim_p_key_idx = attr->pkey_index;
 		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_PRIM_P_KEY_IDX, 1);
 	}
@@ -1257,50 +1282,78 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 		int ehca_mult = ib_rate_to_mult(
 			shca->sport[my_qp->init_attr.port_num].rate);
 
+		if (attr->alt_port_num < 1
+		    || attr->alt_port_num > shca->num_ports) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid alt_port=%x. "
+				 "ehca_qp=%p qp_num=%x num_ports=%x",
+				 attr->alt_port_num, my_qp, ibqp->qp_num,
+				 shca->num_ports);
+			goto modify_qp_exit2;
+		}
+		mqpcb->alt_phys_port = attr->alt_port_num;
+
+		if (attr->alt_pkey_index >= 16) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid alt_pkey_index=%x. "
+				 "ehca_qp=%p qp_num=%x max_pkey_index=f",
+				 attr->pkey_index, my_qp, ibqp->qp_num);
+			goto modify_qp_exit2;
+		}
+		mqpcb->alt_p_key_idx = attr->alt_pkey_index;
+
+		mqpcb->timeout_al = attr->alt_timeout;
 		mqpcb->dlid_al = attr->alt_ah_attr.dlid;
-		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1);
 		mqpcb->source_path_bits_al = attr->alt_ah_attr.src_path_bits;
-		update_mask |=
-			EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1);
 		mqpcb->service_level_al = attr->alt_ah_attr.sl;
-		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1);
 
-		if (ah_mult < ehca_mult)
-			mqpcb->max_static_rate = (ah_mult > 0) ?
-			((ehca_mult - 1) / ah_mult) : 0;
+		if (ah_mult > 0 && ah_mult < ehca_mult)
+			mqpcb->max_static_rate_al = (ehca_mult - 1) / ah_mult;
 		else
 			mqpcb->max_static_rate_al = 0;
 
-		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1);
+		/* OpenIB doesn't support alternate retry counts - copy them */
+		mqpcb->retry_count_al = mqpcb->retry_count;
+		mqpcb->rnr_retry_count_al = mqpcb->rnr_retry_count;
+
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_ALT_PHYS_PORT, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_ALT_P_KEY_IDX, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_TIMEOUT_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_DLID_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_SOURCE_PATH_BITS_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_SERVICE_LEVEL_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_MAX_STATIC_RATE_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_RETRY_COUNT_AL, 1)
+			| EHCA_BMASK_SET(MQPCB_MASK_RNR_RETRY_COUNT_AL, 1);
+
+		/*
+		 * Always supply the GRH flag, even if it's zero, to give the
+		 * hypervisor a clear "yes" or "no" instead of a "perhaps"
+		 */
+		update_mask |= EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
 
 		/*
 		 * only if GRH is TRUE we might consider SOURCE_GID_IDX
 		 * and DEST_GID otherwise phype will return H_ATTR_PARM!!!
 		 */
 		if (attr->alt_ah_attr.ah_flags == IB_AH_GRH) {
-			mqpcb->send_grh_flag_al = 1 << 31;
-			update_mask |=
-				EHCA_BMASK_SET(MQPCB_MASK_SEND_GRH_FLAG_AL, 1);
-			mqpcb->source_gid_idx_al =
-				attr->alt_ah_attr.grh.sgid_index;
-			update_mask |=
-				EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1);
+			mqpcb->send_grh_flag_al = 1;
 
 			for (cnt = 0; cnt < 16; cnt++)
 				mqpcb->dest_gid_al.byte[cnt] =
 					attr->alt_ah_attr.grh.dgid.raw[cnt];
-
-			update_mask |=
-				EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1);
+			mqpcb->source_gid_idx_al =
+				attr->alt_ah_attr.grh.sgid_index;
 			mqpcb->flow_label_al = attr->alt_ah_attr.grh.flow_label;
-			update_mask |=
-				EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1);
 			mqpcb->hop_limit_al = attr->alt_ah_attr.grh.hop_limit;
-			update_mask |=
-				EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1);
 			mqpcb->traffic_class_al =
 				attr->alt_ah_attr.grh.traffic_class;
+
 			update_mask |=
+				EHCA_BMASK_SET(MQPCB_MASK_SOURCE_GID_IDX_AL, 1)
+				| EHCA_BMASK_SET(MQPCB_MASK_DEST_GID_AL, 1)
+				| EHCA_BMASK_SET(MQPCB_MASK_FLOW_LABEL_AL, 1)
+				| EHCA_BMASK_SET(MQPCB_MASK_HOP_LIMIT_AL, 1) |
 				EHCA_BMASK_SET(MQPCB_MASK_TRAFFIC_CLASS_AL, 1);
 		}
 	}
@@ -1322,7 +1375,14 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 	}
 
 	if (attr_mask & IB_QP_PATH_MIG_STATE) {
-		mqpcb->path_migration_state = attr->path_mig_state;
+		if (attr->path_mig_state != IB_MIG_REARM
+		    && attr->path_mig_state != IB_MIG_MIGRATED) {
+			ret = -EINVAL;
+			ehca_err(ibqp->device, "Invalid mig_state=%x",
+				 attr->path_mig_state);
+			goto modify_qp_exit2;
+		}
+		mqpcb->path_migration_state = attr->path_mig_state + 1;
 		update_mask |=
 			EHCA_BMASK_SET(MQPCB_MASK_PATH_MIGRATION_STATE, 1);
 	}
@@ -1348,7 +1408,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 
 	if (h_ret != H_SUCCESS) {
 		ret = ehca2ib_return_code(h_ret);
-		ehca_err(ibqp->device, "hipz_h_modify_qp() failed rc=%lx "
+		ehca_err(ibqp->device, "hipz_h_modify_qp() failed h_ret=%li "
 			 "ehca_qp=%p qp_num=%x", h_ret, my_qp, ibqp->qp_num);
 		goto modify_qp_exit2;
 	}
@@ -1381,7 +1441,7 @@ static int internal_modify_qp(struct ib_qp *ibqp,
 			ret = ehca2ib_return_code(h_ret);
 			ehca_err(ibqp->device, "ENABLE in context of "
 				 "RESET_2_INIT failed! Maybe you didn't get "
-				 "a LID h_ret=%lx ehca_qp=%p qp_num=%x",
+				 "a LID h_ret=%li ehca_qp=%p qp_num=%x",
 				 h_ret, my_qp, ibqp->qp_num);
 			goto modify_qp_exit2;
 		}
@@ -1469,7 +1529,7 @@ int ehca_query_qp(struct ib_qp *qp,
 	if (h_ret != H_SUCCESS) {
 		ret = ehca2ib_return_code(h_ret);
 		ehca_err(qp->device, "hipz_h_query_qp() failed "
-			 "ehca_qp=%p qp_num=%x h_ret=%lx",
+			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, qp->qp_num, h_ret);
 		goto query_qp_exit1;
 	}
@@ -1490,7 +1550,7 @@ int ehca_query_qp(struct ib_qp *qp,
 
 	qp_attr->qkey = qpcb->qkey;
 	qp_attr->path_mtu = qpcb->path_mtu;
-	qp_attr->path_mig_state = qpcb->path_migration_state;
+	qp_attr->path_mig_state = qpcb->path_migration_state - 1;
 	qp_attr->rq_psn = qpcb->receive_psn;
 	qp_attr->sq_psn = qpcb->send_psn;
 	qp_attr->min_rnr_timer = qpcb->min_rnr_nak_timer_field;
@@ -1644,7 +1704,7 @@ int ehca_modify_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr,
 
 	if (h_ret != H_SUCCESS) {
 		ret = ehca2ib_return_code(h_ret);
-		ehca_err(ibsrq->device, "hipz_h_modify_qp() failed rc=%lx "
+		ehca_err(ibsrq->device, "hipz_h_modify_qp() failed h_ret=%li "
 			 "ehca_qp=%p qp_num=%x",
 			 h_ret, my_qp, my_qp->real_qp_num);
 	}
@@ -1687,12 +1747,13 @@ int ehca_query_srq(struct ib_srq *srq, struct ib_srq_attr *srq_attr)
 	if (h_ret != H_SUCCESS) {
 		ret = ehca2ib_return_code(h_ret);
 		ehca_err(srq->device, "hipz_h_query_qp() failed "
-			 "ehca_qp=%p qp_num=%x h_ret=%lx",
+			 "ehca_qp=%p qp_num=%x h_ret=%li",
 			 my_qp, my_qp->real_qp_num, h_ret);
 		goto query_srq_exit1;
 	}
 
 	srq_attr->max_wr = qpcb->max_nr_outst_recv_wr - 1;
+	srq_attr->max_sge = qpcb->actual_nr_sges_in_rq_wqe;
 	srq_attr->srq_limit = EHCA_BMASK_GET(
 		MQPCB_CURR_SRQ_LIMIT, qpcb->curr_srq_limit);
 
@@ -1737,7 +1798,7 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
 		ret = ehca_cq_unassign_qp(my_qp->send_cq, qp_num);
 		if (ret) {
 			ehca_err(dev, "Couldn't unassign qp from "
-				 "send_cq ret=%x qp_num=%x cq_num=%x", ret,
+				 "send_cq ret=%i qp_num=%x cq_num=%x", ret,
 				 qp_num, my_qp->send_cq->cq_number);
 			return ret;
 		}
@@ -1749,7 +1810,7 @@ static int internal_destroy_qp(struct ib_device *dev, struct ehca_qp *my_qp,
 
 	h_ret = hipz_h_destroy_qp(shca->ipz_hca_handle, my_qp);
 	if (h_ret != H_SUCCESS) {
-		ehca_err(dev, "hipz_h_destroy_qp() failed rc=%lx "
+		ehca_err(dev, "hipz_h_destroy_qp() failed h_ret=%li "
 			 "ehca_qp=%p qp_num=%x", h_ret, my_qp, qp_num);
 		return ehca2ib_return_code(h_ret);
 	}
diff --git a/drivers/infiniband/hw/ehca/ehca_reqs.c b/drivers/infiniband/hw/ehca/ehca_reqs.c
index 94eed70fedf5..ea91360835d3 100644
--- a/drivers/infiniband/hw/ehca/ehca_reqs.c
+++ b/drivers/infiniband/hw/ehca/ehca_reqs.c
@@ -526,7 +526,7 @@ poll_cq_one_read_cqe:
 	if (!cqe) {
 		ret = -EAGAIN;
 		ehca_dbg(cq->device, "Completion queue is empty ehca_cq=%p "
-			 "cq_num=%x ret=%x", my_cq, my_cq->cq_number, ret);
+			 "cq_num=%x ret=%i", my_cq, my_cq->cq_number, ret);
 		goto  poll_cq_one_exit0;
 	}
 
diff --git a/drivers/infiniband/hw/ehca/ehca_sqp.c b/drivers/infiniband/hw/ehca/ehca_sqp.c
index 9f16e9c79394..f0792e5fbd02 100644
--- a/drivers/infiniband/hw/ehca/ehca_sqp.c
+++ b/drivers/infiniband/hw/ehca/ehca_sqp.c
@@ -82,7 +82,7 @@ u64 ehca_define_sqp(struct ehca_shca *shca,
 
 		if (ret != H_SUCCESS) {
 			ehca_err(&shca->ib_device,
-				 "Can't define AQP1 for port %x. rc=%lx",
+				 "Can't define AQP1 for port %x. h_ret=%li",
 				 port, ret);
 			return ret;
 		}
diff --git a/drivers/infiniband/hw/ehca/ehca_tools.h b/drivers/infiniband/hw/ehca/ehca_tools.h
index 57c77a715f46..4a8346a2bc9e 100644
--- a/drivers/infiniband/hw/ehca/ehca_tools.h
+++ b/drivers/infiniband/hw/ehca/ehca_tools.h
@@ -73,40 +73,37 @@ extern int ehca_debug_level;
 		if (unlikely(ehca_debug_level)) \
 			dev_printk(KERN_DEBUG, (ib_dev)->dma_device, \
 				   "PU%04x EHCA_DBG:%s " format "\n", \
-				   get_paca()->paca_index, __FUNCTION__, \
+				   raw_smp_processor_id(), __FUNCTION__, \
 				   ## arg); \
 	} while (0)
 
 #define ehca_info(ib_dev, format, arg...) \
 	dev_info((ib_dev)->dma_device, "PU%04x EHCA_INFO:%s " format "\n", \
-		 get_paca()->paca_index, __FUNCTION__, ## arg)
+		 raw_smp_processor_id(), __FUNCTION__, ## arg)
 
 #define ehca_warn(ib_dev, format, arg...) \
 	dev_warn((ib_dev)->dma_device, "PU%04x EHCA_WARN:%s " format "\n", \
-		 get_paca()->paca_index, __FUNCTION__, ## arg)
+		 raw_smp_processor_id(), __FUNCTION__, ## arg)
 
 #define ehca_err(ib_dev, format, arg...) \
 	dev_err((ib_dev)->dma_device, "PU%04x EHCA_ERR:%s " format "\n", \
-		get_paca()->paca_index, __FUNCTION__, ## arg)
+		raw_smp_processor_id(), __FUNCTION__, ## arg)
 
 /* use this one only if no ib_dev available */
 #define ehca_gen_dbg(format, arg...) \
 	do { \
 		if (unlikely(ehca_debug_level)) \
 			printk(KERN_DEBUG "PU%04x EHCA_DBG:%s " format "\n", \
-			       get_paca()->paca_index, __FUNCTION__, ## arg); \
+			       raw_smp_processor_id(), __FUNCTION__, ## arg); \
 	} while (0)
 
 #define ehca_gen_warn(format, arg...) \
-	do { \
-		if (unlikely(ehca_debug_level)) \
-			printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
-			       get_paca()->paca_index, __FUNCTION__, ## arg); \
-	} while (0)
+	printk(KERN_INFO "PU%04x EHCA_WARN:%s " format "\n", \
+	       raw_smp_processor_id(), __FUNCTION__, ## arg)
 
 #define ehca_gen_err(format, arg...) \
 	printk(KERN_ERR "PU%04x EHCA_ERR:%s " format "\n", \
-		get_paca()->paca_index, __FUNCTION__, ## arg)
+	       raw_smp_processor_id(), __FUNCTION__, ## arg)
 
 /**
  * ehca_dmp - printk a memory block, whose length is n*8 bytes.
diff --git a/drivers/infiniband/hw/ehca/ehca_uverbs.c b/drivers/infiniband/hw/ehca/ehca_uverbs.c
index 4bc687fdf531..5234d6c15c49 100644
--- a/drivers/infiniband/hw/ehca/ehca_uverbs.c
+++ b/drivers/infiniband/hw/ehca/ehca_uverbs.c
@@ -109,7 +109,7 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
 	u64 vsize, physical;
 
 	vsize = vma->vm_end - vma->vm_start;
-	if (vsize != EHCA_PAGESIZE) {
+	if (vsize < EHCA_PAGESIZE) {
 		ehca_gen_err("invalid vsize=%lx", vma->vm_end - vma->vm_start);
 		return -EINVAL;
 	}
@@ -118,10 +118,10 @@ static int ehca_mmap_fw(struct vm_area_struct *vma, struct h_galpas *galpas,
 	vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
 	ehca_gen_dbg("vsize=%lx physical=%lx", vsize, physical);
 	/* VM_IO | VM_RESERVED are set by remap_pfn_range() */
-	ret = remap_pfn_range(vma, vma->vm_start, physical >> PAGE_SHIFT,
-			      vsize, vma->vm_page_prot);
+	ret = remap_4k_pfn(vma, vma->vm_start, physical >> EHCA_PAGESHIFT,
+			   vma->vm_page_prot);
 	if (unlikely(ret)) {
-		ehca_gen_err("remap_pfn_range() failed ret=%x", ret);
+		ehca_gen_err("remap_pfn_range() failed ret=%i", ret);
 		return -ENOMEM;
 	}
 
@@ -146,7 +146,7 @@ static int ehca_mmap_queue(struct vm_area_struct *vma, struct ipz_queue *queue,
 		page = virt_to_page(virt_addr);
 		ret = vm_insert_page(vma, start, page);
 		if (unlikely(ret)) {
-			ehca_gen_err("vm_insert_page() failed rc=%x", ret);
+			ehca_gen_err("vm_insert_page() failed rc=%i", ret);
 			return ret;
 		}
 		start += PAGE_SIZE;
@@ -164,23 +164,23 @@ static int ehca_mmap_cq(struct vm_area_struct *vma, struct ehca_cq *cq,
 	int ret;
 
 	switch (rsrc_type) {
-	case 1: /* galpa fw handle */
+	case 0: /* galpa fw handle */
 		ehca_dbg(cq->ib_cq.device, "cq_num=%x fw", cq->cq_number);
 		ret = ehca_mmap_fw(vma, &cq->galpas, &cq->mm_count_galpa);
 		if (unlikely(ret)) {
 			ehca_err(cq->ib_cq.device,
-				 "ehca_mmap_fw() failed rc=%x cq_num=%x",
+				 "ehca_mmap_fw() failed rc=%i cq_num=%x",
 				 ret, cq->cq_number);
 			return ret;
 		}
 		break;
 
-	case 2: /* cq queue_addr */
+	case 1: /* cq queue_addr */
 		ehca_dbg(cq->ib_cq.device, "cq_num=%x queue", cq->cq_number);
 		ret = ehca_mmap_queue(vma, &cq->ipz_queue, &cq->mm_count_queue);
 		if (unlikely(ret)) {
 			ehca_err(cq->ib_cq.device,
-				 "ehca_mmap_queue() failed rc=%x cq_num=%x",
+				 "ehca_mmap_queue() failed rc=%i cq_num=%x",
 				 ret, cq->cq_number);
 			return ret;
 		}
@@ -201,38 +201,38 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
 	int ret;
 
 	switch (rsrc_type) {
-	case 1: /* galpa fw handle */
+	case 0: /* galpa fw handle */
 		ehca_dbg(qp->ib_qp.device, "qp_num=%x fw", qp->ib_qp.qp_num);
 		ret = ehca_mmap_fw(vma, &qp->galpas, &qp->mm_count_galpa);
 		if (unlikely(ret)) {
 			ehca_err(qp->ib_qp.device,
-				 "remap_pfn_range() failed ret=%x qp_num=%x",
+				 "remap_pfn_range() failed ret=%i qp_num=%x",
 				 ret, qp->ib_qp.qp_num);
 			return -ENOMEM;
 		}
 		break;
 
-	case 2: /* qp rqueue_addr */
+	case 1: /* qp rqueue_addr */
 		ehca_dbg(qp->ib_qp.device, "qp_num=%x rqueue",
 			 qp->ib_qp.qp_num);
 		ret = ehca_mmap_queue(vma, &qp->ipz_rqueue,
 				      &qp->mm_count_rqueue);
 		if (unlikely(ret)) {
 			ehca_err(qp->ib_qp.device,
-				 "ehca_mmap_queue(rq) failed rc=%x qp_num=%x",
+				 "ehca_mmap_queue(rq) failed rc=%i qp_num=%x",
 				 ret, qp->ib_qp.qp_num);
 			return ret;
 		}
 		break;
 
-	case 3: /* qp squeue_addr */
+	case 2: /* qp squeue_addr */
 		ehca_dbg(qp->ib_qp.device, "qp_num=%x squeue",
 			 qp->ib_qp.qp_num);
 		ret = ehca_mmap_queue(vma, &qp->ipz_squeue,
 				      &qp->mm_count_squeue);
 		if (unlikely(ret)) {
 			ehca_err(qp->ib_qp.device,
-				 "ehca_mmap_queue(sq) failed rc=%x qp_num=%x",
+				 "ehca_mmap_queue(sq) failed rc=%i qp_num=%x",
 				 ret, qp->ib_qp.qp_num);
 			return ret;
 		}
@@ -249,10 +249,10 @@ static int ehca_mmap_qp(struct vm_area_struct *vma, struct ehca_qp *qp,
 
 int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 {
-	u64 fileoffset = vma->vm_pgoff << PAGE_SHIFT;
-	u32 idr_handle = fileoffset >> 32;
-	u32 q_type = (fileoffset >> 28) & 0xF;	  /* CQ, QP,...        */
-	u32 rsrc_type = (fileoffset >> 24) & 0xF; /* sq,rq,cmnd_window */
+	u64 fileoffset = vma->vm_pgoff;
+	u32 idr_handle = fileoffset & 0x1FFFFFF;
+	u32 q_type = (fileoffset >> 27) & 0x1;	  /* CQ, QP,...        */
+	u32 rsrc_type = (fileoffset >> 25) & 0x3; /* sq,rq,cmnd_window */
 	u32 cur_pid = current->tgid;
 	u32 ret;
 	struct ehca_cq *cq;
@@ -261,7 +261,7 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 	struct ib_uobject *uobject;
 
 	switch (q_type) {
-	case  1: /* CQ */
+	case  0: /* CQ */
 		read_lock(&ehca_cq_idr_lock);
 		cq = idr_find(&ehca_cq_idr, idr_handle);
 		read_unlock(&ehca_cq_idr_lock);
@@ -283,13 +283,13 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 		ret = ehca_mmap_cq(vma, cq, rsrc_type);
 		if (unlikely(ret)) {
 			ehca_err(cq->ib_cq.device,
-				 "ehca_mmap_cq() failed rc=%x cq_num=%x",
+				 "ehca_mmap_cq() failed rc=%i cq_num=%x",
 				 ret, cq->cq_number);
 			return ret;
 		}
 		break;
 
-	case 2: /* QP */
+	case 1: /* QP */
 		read_lock(&ehca_qp_idr_lock);
 		qp = idr_find(&ehca_qp_idr, idr_handle);
 		read_unlock(&ehca_qp_idr_lock);
@@ -313,7 +313,7 @@ int ehca_mmap(struct ib_ucontext *context, struct vm_area_struct *vma)
 		ret = ehca_mmap_qp(vma, qp, rsrc_type);
 		if (unlikely(ret)) {
 			ehca_err(qp->ib_qp.device,
-				 "ehca_mmap_qp() failed rc=%x qp_num=%x",
+				 "ehca_mmap_qp() failed rc=%i qp_num=%x",
 				 ret, qp->ib_qp.qp_num);
 			return ret;
 		}
diff --git a/drivers/infiniband/hw/ehca/hcp_if.c b/drivers/infiniband/hw/ehca/hcp_if.c
index 24f454162f24..c16a21374bb5 100644
--- a/drivers/infiniband/hw/ehca/hcp_if.c
+++ b/drivers/infiniband/hw/ehca/hcp_if.c
@@ -84,6 +84,10 @@
 #define H_MP_SHUTDOWN                   EHCA_BMASK_IBM(48, 48)
 #define H_MP_RESET_QKEY_CTR             EHCA_BMASK_IBM(49, 49)
 
+#define HCALL4_REGS_FORMAT "r4=%lx r5=%lx r6=%lx r7=%lx" /* 4 %lx fields */
+#define HCALL7_REGS_FORMAT HCALL4_REGS_FORMAT " r8=%lx r9=%lx r10=%lx" /* 7 */
+#define HCALL9_REGS_FORMAT HCALL7_REGS_FORMAT " r11=%lx r12=%lx" /* 9 fields */
+
 static DEFINE_SPINLOCK(hcall_lock);
 
 static u32 get_longbusy_msecs(int longbusy_rc)
@@ -116,16 +120,28 @@ static long ehca_plpar_hcall_norets(unsigned long opcode,
 				    unsigned long arg7)
 {
 	long ret;
-	int i, sleep_msecs;
+	int i, sleep_msecs, do_lock = 0; /* default: hcall_lock not taken */
+	unsigned long flags = 0;
 
-	ehca_gen_dbg("opcode=%lx arg1=%lx arg2=%lx arg3=%lx arg4=%lx "
-		     "arg5=%lx arg6=%lx arg7=%lx",
+	ehca_gen_dbg("opcode=%lx " HCALL7_REGS_FORMAT,
 		     opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7);
 
+	/* lock H_FREE_RESOURCE(MR) against itself and H_ALLOC_RESOURCE(MR) */
+	if ((opcode == H_FREE_RESOURCE) && (arg7 == 5)) {
+		arg7 = 0; /* better not upset firmware */
+		do_lock = 1;
+	}
+
 	for (i = 0; i < 5; i++) {
+		if (do_lock)
+			spin_lock_irqsave(&hcall_lock, flags);
+
 		ret = plpar_hcall_norets(opcode, arg1, arg2, arg3, arg4,
 					 arg5, arg6, arg7);
 
+		if (do_lock)
+			spin_unlock_irqrestore(&hcall_lock, flags);
+
 		if (H_IS_LONG_BUSY(ret)) {
 			sleep_msecs = get_longbusy_msecs(ret);
 			msleep_interruptible(sleep_msecs);
@@ -133,16 +149,13 @@ static long ehca_plpar_hcall_norets(unsigned long opcode,
 		}
 
 		if (ret < H_SUCCESS)
-			ehca_gen_err("opcode=%lx ret=%lx"
-				     " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
-				     " arg5=%lx arg6=%lx arg7=%lx ",
-				     opcode, ret,
-				     arg1, arg2, arg3, arg4, arg5,
-				     arg6, arg7);
-
-		ehca_gen_dbg("opcode=%lx ret=%lx", opcode, ret);
-		return ret;
+			ehca_gen_err("opcode=%lx ret=%li " HCALL7_REGS_FORMAT,
+				     opcode, ret, arg1, arg2, arg3,
+				     arg4, arg5, arg6, arg7);
+		else
+			ehca_gen_dbg("opcode=%lx ret=%li", opcode, ret);
 
+		return ret;
 	}
 
 	return H_BUSY;
@@ -161,25 +174,24 @@ static long ehca_plpar_hcall9(unsigned long opcode,
 			      unsigned long arg9)
 {
 	long ret;
-	int i, sleep_msecs, lock_is_set = 0;
+	int i, sleep_msecs, do_lock;
 	unsigned long flags = 0;
 
-	ehca_gen_dbg("opcode=%lx arg1=%lx arg2=%lx arg3=%lx arg4=%lx "
-		     "arg5=%lx arg6=%lx arg7=%lx arg8=%lx arg9=%lx",
-		     opcode, arg1, arg2, arg3, arg4, arg5, arg6, arg7,
-		     arg8, arg9);
+	ehca_gen_dbg("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT, opcode,
+		     arg1, arg2, arg3, arg4, arg5, arg6, arg7, arg8, arg9);
+
+	/* lock H_ALLOC_RESOURCE(MR) against itself and H_FREE_RESOURCE(MR) */
+	do_lock = ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5));
 
 	for (i = 0; i < 5; i++) {
-		if ((opcode == H_ALLOC_RESOURCE) && (arg2 == 5)) {
+		if (do_lock)
 			spin_lock_irqsave(&hcall_lock, flags);
-			lock_is_set = 1;
-		}
 
 		ret = plpar_hcall9(opcode, outs,
 				   arg1, arg2, arg3, arg4, arg5,
 				   arg6, arg7, arg8, arg9);
 
-		if (lock_is_set)
+		if (do_lock)
 			spin_unlock_irqrestore(&hcall_lock, flags);
 
 		if (H_IS_LONG_BUSY(ret)) {
@@ -188,26 +200,19 @@ static long ehca_plpar_hcall9(unsigned long opcode,
 			continue;
 		}
 
-		if (ret < H_SUCCESS)
-			ehca_gen_err("opcode=%lx ret=%lx"
-				     " arg1=%lx arg2=%lx arg3=%lx arg4=%lx"
-				     " arg5=%lx arg6=%lx arg7=%lx arg8=%lx"
-				     " arg9=%lx"
-				     " out1=%lx out2=%lx out3=%lx out4=%lx"
-				     " out5=%lx out6=%lx out7=%lx out8=%lx"
-				     " out9=%lx",
-				     opcode, ret,
-				     arg1, arg2, arg3, arg4, arg5,
-				     arg6, arg7, arg8, arg9,
-				     outs[0], outs[1], outs[2], outs[3],
+		if (ret < H_SUCCESS) {
+			ehca_gen_err("INPUT -- opcode=%lx " HCALL9_REGS_FORMAT,
+				     opcode, arg1, arg2, arg3, arg4, arg5,
+				     arg6, arg7, arg8, arg9);
+			ehca_gen_err("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+				     ret, outs[0], outs[1], outs[2], outs[3],
+				     outs[4], outs[5], outs[6], outs[7],
+				     outs[8]);
+		} else
+			ehca_gen_dbg("OUTPUT -- ret=%li " HCALL9_REGS_FORMAT,
+				     ret, outs[0], outs[1], outs[2], outs[3],
 				     outs[4], outs[5], outs[6], outs[7],
 				     outs[8]);
-
-		ehca_gen_dbg("opcode=%lx ret=%lx out1=%lx out2=%lx out3=%lx "
-			     "out4=%lx out5=%lx out6=%lx out7=%lx out8=%lx "
-			     "out9=%lx",
-			     opcode, ret, outs[0], outs[1], outs[2], outs[3],
-			     outs[4], outs[5], outs[6], outs[7], outs[8]);
 		return ret;
 	}
 
@@ -247,7 +252,7 @@ u64 hipz_h_alloc_resource_eq(const struct ipz_adapter_handle adapter_handle,
 	*eq_ist = (u32)outs[5];
 
 	if (ret == H_NOT_ENOUGH_RESOURCES)
-		ehca_gen_err("Not enough resource - ret=%lx ", ret);
+		ehca_gen_err("Not enough resource - ret=%li ", ret);
 
 	return ret;
 }
@@ -285,7 +290,7 @@ u64 hipz_h_alloc_resource_cq(const struct ipz_adapter_handle adapter_handle,
 		hcp_galpas_ctor(&cq->galpas, outs[5], outs[6]);
 
 	if (ret == H_NOT_ENOUGH_RESOURCES)
-		ehca_gen_err("Not enough resources. ret=%lx", ret);
+		ehca_gen_err("Not enough resources. ret=%li", ret);
 
 	return ret;
 }
@@ -360,7 +365,7 @@ u64 hipz_h_alloc_resource_qp(const struct ipz_adapter_handle adapter_handle,
 		hcp_galpas_ctor(&parms->galpas, outs[6], outs[6]);
 
 	if (ret == H_NOT_ENOUGH_RESOURCES)
-		ehca_gen_err("Not enough resources. ret=%lx", ret);
+		ehca_gen_err("Not enough resources. ret=%li", ret);
 
 	return ret;
 }
@@ -555,7 +560,7 @@ u64 hipz_h_modify_qp(const struct ipz_adapter_handle adapter_handle,
 				0, 0, 0, 0, 0);
 
 	if (ret == H_NOT_ENOUGH_RESOURCES)
-		ehca_gen_err("Insufficient resources ret=%lx", ret);
+		ehca_gen_err("Insufficient resources ret=%li", ret);
 
 	return ret;
 }
@@ -591,7 +596,7 @@ u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
 				qp->ipz_qp_handle.handle,  /* r6 */
 				0, 0, 0, 0, 0, 0);
 	if (ret == H_HARDWARE)
-		ehca_gen_err("HCA not operational. ret=%lx", ret);
+		ehca_gen_err("HCA not operational. ret=%li", ret);
 
 	ret = ehca_plpar_hcall_norets(H_FREE_RESOURCE,
 				      adapter_handle.handle,     /* r4 */
@@ -599,7 +604,7 @@ u64 hipz_h_destroy_qp(const struct ipz_adapter_handle adapter_handle,
 				      0, 0, 0, 0, 0);
 
 	if (ret == H_RESOURCE)
-		ehca_gen_err("Resource still in use. ret=%lx", ret);
+		ehca_gen_err("Resource still in use. ret=%li", ret);
 
 	return ret;
 }
@@ -634,7 +639,7 @@ u64 hipz_h_define_aqp1(const struct ipz_adapter_handle adapter_handle,
 	*bma_qp_nr = (u32)outs[1];
 
 	if (ret == H_ALIAS_EXIST)
-		ehca_gen_err("AQP1 already exists. ret=%lx", ret);
+		ehca_gen_err("AQP1 already exists. ret=%li", ret);
 
 	return ret;
 }
@@ -656,7 +661,7 @@ u64 hipz_h_attach_mcqp(const struct ipz_adapter_handle adapter_handle,
 				      0, 0);
 
 	if (ret == H_NOT_ENOUGH_RESOURCES)
-		ehca_gen_err("Not enough resources. ret=%lx", ret);
+		ehca_gen_err("Not enough resources. ret=%li", ret);
 
 	return ret;
 }
@@ -695,7 +700,7 @@ u64 hipz_h_destroy_cq(const struct ipz_adapter_handle adapter_handle,
 				      0, 0, 0, 0);
 
 	if (ret == H_RESOURCE)
-		ehca_gen_err("H_FREE_RESOURCE failed ret=%lx ", ret);
+		ehca_gen_err("H_FREE_RESOURCE failed ret=%li ", ret);
 
 	return ret;
 }
@@ -717,7 +722,7 @@ u64 hipz_h_destroy_eq(const struct ipz_adapter_handle adapter_handle,
 				      0, 0, 0, 0, 0);
 
 	if (ret == H_RESOURCE)
-		ehca_gen_err("Resource in use. ret=%lx ", ret);
+		ehca_gen_err("Resource in use. ret=%li ", ret);
 
 	return ret;
 }
@@ -816,7 +821,7 @@ u64 hipz_h_free_resource_mr(const struct ipz_adapter_handle adapter_handle,
 	return ehca_plpar_hcall_norets(H_FREE_RESOURCE,
 				       adapter_handle.handle,    /* r4 */
 				       mr->ipz_mr_handle.handle, /* r5 */
-				       0, 0, 0, 0, 0);
+				       0, 0, 0, 0, 5);
 }
 
 u64 hipz_h_reregister_pmr(const struct ipz_adapter_handle adapter_handle,
diff --git a/drivers/infiniband/hw/ehca/ipz_pt_fn.c b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
index 29bd476fbd54..661f8db62706 100644
--- a/drivers/infiniband/hw/ehca/ipz_pt_fn.c
+++ b/drivers/infiniband/hw/ehca/ipz_pt_fn.c
@@ -158,6 +158,7 @@ static int alloc_small_queue_page(struct ipz_queue *queue, struct ehca_pd *pd)
 
 	queue->queue_pages[0] = (void *)(page->page | (bit << (order + 9)));
 	queue->small_page = page;
+	queue->offset = bit << (order + 9);
 	return 1;
 
 out:
diff --git a/drivers/infiniband/hw/ipath/ipath_common.h b/drivers/infiniband/hw/ipath/ipath_common.h
index 6ad822c35930..851df8a75e79 100644
--- a/drivers/infiniband/hw/ipath/ipath_common.h
+++ b/drivers/infiniband/hw/ipath/ipath_common.h
@@ -189,6 +189,8 @@ typedef enum _ipath_ureg {
 #define IPATH_RUNTIME_RCVHDR_COPY	0x8
 #define IPATH_RUNTIME_MASTER	0x10
 /* 0x20 and 0x40 are no longer used, but are reserved for ABI compatibility */
+#define IPATH_RUNTIME_FORCE_PIOAVAIL 0x400
+#define IPATH_RUNTIME_PIO_REGSWAPPED 0x800
 
 /*
  * This structure is returned by ipath_userinit() immediately after
@@ -350,7 +352,7 @@ struct ipath_base_info {
  * may not be implemented; the user code must deal with this if it
  * cares, or it must abort after initialization reports the difference.
  */
-#define IPATH_USER_SWMINOR 5
+#define IPATH_USER_SWMINOR 6
 
 #define IPATH_USER_SWVERSION ((IPATH_USER_SWMAJOR<<16) | IPATH_USER_SWMINOR)
 
diff --git a/drivers/infiniband/hw/ipath/ipath_cq.c b/drivers/infiniband/hw/ipath/ipath_cq.c
index a6f04d27ec57..645ed71fd797 100644
--- a/drivers/infiniband/hw/ipath/ipath_cq.c
+++ b/drivers/infiniband/hw/ipath/ipath_cq.c
@@ -76,22 +76,25 @@ void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int solicited)
 		}
 		return;
 	}
-	wc->queue[head].wr_id = entry->wr_id;
-	wc->queue[head].status = entry->status;
-	wc->queue[head].opcode = entry->opcode;
-	wc->queue[head].vendor_err = entry->vendor_err;
-	wc->queue[head].byte_len = entry->byte_len;
-	wc->queue[head].imm_data = (__u32 __force)entry->imm_data;
-	wc->queue[head].qp_num = entry->qp->qp_num;
-	wc->queue[head].src_qp = entry->src_qp;
-	wc->queue[head].wc_flags = entry->wc_flags;
-	wc->queue[head].pkey_index = entry->pkey_index;
-	wc->queue[head].slid = entry->slid;
-	wc->queue[head].sl = entry->sl;
-	wc->queue[head].dlid_path_bits = entry->dlid_path_bits;
-	wc->queue[head].port_num = entry->port_num;
-	/* Make sure queue entry is written before the head index. */
-	smp_wmb();
+	if (cq->ip) {
+		wc->uqueue[head].wr_id = entry->wr_id;
+		wc->uqueue[head].status = entry->status;
+		wc->uqueue[head].opcode = entry->opcode;
+		wc->uqueue[head].vendor_err = entry->vendor_err;
+		wc->uqueue[head].byte_len = entry->byte_len;
+		wc->uqueue[head].imm_data = (__u32 __force)entry->imm_data;
+		wc->uqueue[head].qp_num = entry->qp->qp_num;
+		wc->uqueue[head].src_qp = entry->src_qp;
+		wc->uqueue[head].wc_flags = entry->wc_flags;
+		wc->uqueue[head].pkey_index = entry->pkey_index;
+		wc->uqueue[head].slid = entry->slid;
+		wc->uqueue[head].sl = entry->sl;
+		wc->uqueue[head].dlid_path_bits = entry->dlid_path_bits;
+		wc->uqueue[head].port_num = entry->port_num;
+		/* Make sure entry is written before the head index. */
+		smp_wmb();
+	} else
+		wc->kqueue[head] = *entry;
 	wc->head = next;
 
 	if (cq->notify == IB_CQ_NEXT_COMP ||
@@ -130,6 +133,12 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 	int npolled;
 	u32 tail;
 
+	/* The kernel can only poll a kernel completion queue */
+	if (cq->ip) {
+		npolled = -EINVAL;
+		goto bail;
+	}
+
 	spin_lock_irqsave(&cq->lock, flags);
 
 	wc = cq->queue;
@@ -137,31 +146,10 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 	if (tail > (u32) cq->ibcq.cqe)
 		tail = (u32) cq->ibcq.cqe;
 	for (npolled = 0; npolled < num_entries; ++npolled, ++entry) {
-		struct ipath_qp *qp;
-
 		if (tail == wc->head)
 			break;
-		/* Make sure entry is read after head index is read. */
-		smp_rmb();
-		qp = ipath_lookup_qpn(&to_idev(cq->ibcq.device)->qp_table,
-				      wc->queue[tail].qp_num);
-		entry->qp = &qp->ibqp;
-		if (atomic_dec_and_test(&qp->refcount))
-			wake_up(&qp->wait);
-
-		entry->wr_id = wc->queue[tail].wr_id;
-		entry->status = wc->queue[tail].status;
-		entry->opcode = wc->queue[tail].opcode;
-		entry->vendor_err = wc->queue[tail].vendor_err;
-		entry->byte_len = wc->queue[tail].byte_len;
-		entry->imm_data = wc->queue[tail].imm_data;
-		entry->src_qp = wc->queue[tail].src_qp;
-		entry->wc_flags = wc->queue[tail].wc_flags;
-		entry->pkey_index = wc->queue[tail].pkey_index;
-		entry->slid = wc->queue[tail].slid;
-		entry->sl = wc->queue[tail].sl;
-		entry->dlid_path_bits = wc->queue[tail].dlid_path_bits;
-		entry->port_num = wc->queue[tail].port_num;
+		/* The kernel doesn't need a RMB since it has the lock. */
+		*entry = wc->kqueue[tail];
 		if (tail >= cq->ibcq.cqe)
 			tail = 0;
 		else
@@ -171,6 +159,7 @@ int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry)
 
 	spin_unlock_irqrestore(&cq->lock, flags);
 
+bail:
 	return npolled;
 }
 
@@ -215,6 +204,7 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vec
 	struct ipath_cq *cq;
 	struct ipath_cq_wc *wc;
 	struct ib_cq *ret;
+	u32 sz;
 
 	if (entries < 1 || entries > ib_ipath_max_cqes) {
 		ret = ERR_PTR(-EINVAL);
@@ -235,7 +225,12 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vec
 	 * We need to use vmalloc() in order to support mmap and large
 	 * numbers of entries.
 	 */
-	wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * entries);
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (entries + 1);
+	else
+		sz += sizeof(struct ib_wc) * (entries + 1);
+	wc = vmalloc_user(sz);
 	if (!wc) {
 		ret = ERR_PTR(-ENOMEM);
 		goto bail_cq;
@@ -247,9 +242,8 @@ struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vec
 	 */
 	if (udata && udata->outlen >= sizeof(__u64)) {
 		int err;
-		u32 s = sizeof *wc + sizeof(struct ib_wc) * entries;
 
-		cq->ip = ipath_create_mmap_info(dev, s, context, wc);
+		cq->ip = ipath_create_mmap_info(dev, sz, context, wc);
 		if (!cq->ip) {
 			ret = ERR_PTR(-ENOMEM);
 			goto bail_wc;
@@ -380,6 +374,7 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 	struct ipath_cq_wc *wc;
 	u32 head, tail, n;
 	int ret;
+	u32 sz;
 
 	if (cqe < 1 || cqe > ib_ipath_max_cqes) {
 		ret = -EINVAL;
@@ -389,7 +384,12 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 	/*
 	 * Need to use vmalloc() if we want to support large #s of entries.
 	 */
-	wc = vmalloc_user(sizeof(*wc) + sizeof(struct ib_wc) * cqe);
+	sz = sizeof(*wc);
+	if (udata && udata->outlen >= sizeof(__u64))
+		sz += sizeof(struct ib_uverbs_wc) * (cqe + 1);
+	else
+		sz += sizeof(struct ib_wc) * (cqe + 1);
+	wc = vmalloc_user(sz);
 	if (!wc) {
 		ret = -ENOMEM;
 		goto bail;
@@ -430,7 +430,10 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 		goto bail;
 	}
 	for (n = 0; tail != head; n++) {
-		wc->queue[n] = old_wc->queue[tail];
+		if (cq->ip)
+			wc->uqueue[n] = old_wc->uqueue[tail];
+		else
+			wc->kqueue[n] = old_wc->kqueue[tail];
 		if (tail == (u32) cq->ibcq.cqe)
 			tail = 0;
 		else
@@ -447,9 +450,8 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata)
 	if (cq->ip) {
 		struct ipath_ibdev *dev = to_idev(ibcq->device);
 		struct ipath_mmap_info *ip = cq->ip;
-		u32 s = sizeof *wc + sizeof(struct ib_wc) * cqe;
 
-		ipath_update_mmap_info(dev, ip, s, wc);
+		ipath_update_mmap_info(dev, ip, sz, wc);
 		spin_lock_irq(&dev->pending_lock);
 		if (list_empty(&ip->pending_mmaps))
 			list_add(&ip->pending_mmaps, &dev->pending_mmaps);
diff --git a/drivers/infiniband/hw/ipath/ipath_diag.c b/drivers/infiniband/hw/ipath/ipath_diag.c
index cf25cdab02f9..4137c7770f1b 100644
--- a/drivers/infiniband/hw/ipath/ipath_diag.c
+++ b/drivers/infiniband/hw/ipath/ipath_diag.c
@@ -446,19 +446,21 @@ static ssize_t ipath_diagpkt_write(struct file *fp,
 			   dd->ipath_unit, plen - 1, pbufn);
 
 	if (dp.pbc_wd == 0)
-		/* Legacy operation, use computed pbc_wd */
 		dp.pbc_wd = plen;
-
-	/* we have to flush after the PBC for correctness on some cpus
-	 * or WC buffer can be written out of order */
 	writeq(dp.pbc_wd, piobuf);
-	ipath_flush_wc();
-	/* copy all by the trigger word, then flush, so it's written
+	/*
+	 * Copy all but the trigger word, then flush, so it's written
 	 * to chip before trigger word, then write trigger word, then
-	 * flush again, so packet is sent. */
-	__iowrite32_copy(piobuf + 2, tmpbuf, clen - 1);
-	ipath_flush_wc();
-	__raw_writel(tmpbuf[clen - 1], piobuf + clen + 1);
+	 * flush again, so packet is sent.
+	 */
+	if (dd->ipath_flags & IPATH_PIO_FLUSH_WC) {
+		ipath_flush_wc();
+		__iowrite32_copy(piobuf + 2, tmpbuf, clen - 1);
+		ipath_flush_wc();
+		__raw_writel(tmpbuf[clen - 1], piobuf + clen + 1);
+	} else
+		__iowrite32_copy(piobuf + 2, tmpbuf, clen);
+
 	ipath_flush_wc();
 
 	ret = sizeof(dp);
diff --git a/drivers/infiniband/hw/ipath/ipath_driver.c b/drivers/infiniband/hw/ipath/ipath_driver.c
index 6ccba365a24c..1f152ded1e3c 100644
--- a/drivers/infiniband/hw/ipath/ipath_driver.c
+++ b/drivers/infiniband/hw/ipath/ipath_driver.c
@@ -34,6 +34,7 @@
 #include <linux/spinlock.h>
 #include <linux/idr.h>
 #include <linux/pci.h>
+#include <linux/io.h>
 #include <linux/delay.h>
 #include <linux/netdevice.h>
 #include <linux/vmalloc.h>
@@ -280,6 +281,89 @@ void __attribute__((weak)) ipath_disable_wc(struct ipath_devdata *dd)
 {
 }
 
+/*
+ * Perform a PIO buffer bandwidth write test, to verify proper system
+ * configuration.  Even when all the setup calls work, occasionally
+ * BIOS or other issues can prevent write combining from working, or
+ * can cause other bandwidth problems to the chip.
+ *
+ * This test simply writes the same buffer over and over again, and
+ * measures close to the peak bandwidth to the chip (not testing
+ * data bandwidth to the wire).   On chips that use an address-based
+ * trigger to send packets to the wire, this is easy.  On chips that
+ * use a count to trigger, we want to make sure that the packet doesn't
+ * go out on the wire, or trigger flow control checks.
+ */
+static void ipath_verify_pioperf(struct ipath_devdata *dd)
+{
+	u32 pbnum, cnt, lcnt; /* pbnum is filled in by ipath_getpiobuf() */
+	u32 __iomem *piobuf;
+	u32 *addr; /* source buffer for the copies; contents never set */
+	u64 msecs, emsecs; /* msec timestamp, elapsed msec */
+
+	piobuf = ipath_getpiobuf(dd, &pbnum);
+	if (!piobuf) {
+		dev_info(&dd->pcidev->dev,
+			"No PIObufs for checking perf, skipping\n");
+		return;
+	}
+
+	/*
+	 * Enough to give us a reasonable test, less than piobuf size, and
+	 * likely multiple of store buffer length.
+	 */
+	cnt = 1024; /* bytes written per iteration of the timed loop */
+
+	addr = vmalloc(cnt);
+	if (!addr) {
+		dev_info(&dd->pcidev->dev,
+			"Couldn't get memory for checking PIO perf,"
+			" skipping\n");
+		goto done; /* still have to disarm the PIO buffer */
+	}
+
+	preempt_disable();  /* we want reasonably accurate elapsed time */
+	msecs = 1 + jiffies_to_msecs(jiffies);
+	for (lcnt = 0; lcnt < 10000U; lcnt++) {
+		/* wait until we cross msec boundary */
+		if (jiffies_to_msecs(jiffies) >= msecs)
+			break;
+		udelay(1);
+	}
+
+	writeq(0, piobuf); /* length 0, no dwords actually sent */
+	ipath_flush_wc();
+
+	/*
+	 * this is only roughly accurate, since even with preempt we
+	 * still take interrupts that could take a while.   Running for
+	 * >= 5 msec seems to get us "close enough" to accurate values
+	 */
+	msecs = jiffies_to_msecs(jiffies);
+	for (emsecs = lcnt = 0; emsecs <= 5UL; lcnt++) {
+		__iowrite32_copy(piobuf + 64, addr, cnt >> 2); /* in dwords */
+		emsecs = jiffies_to_msecs(jiffies) - msecs;
+	}
+
+	/* 1 GiB/sec, slightly over IB SDR line rate */
+	if (lcnt < (emsecs * 1024U))
+		ipath_dev_err(dd,
+			"Performance problem: bandwidth to PIO buffers is "
+			"only %u MiB/sec\n",
+			lcnt / (u32) emsecs);
+	else
+		ipath_dbg("PIO buffer bandwidth %u MiB/sec is OK\n",
+			lcnt / (u32) emsecs);
+
+	preempt_enable();
+
+	vfree(addr);
+
+done:
+	/* disarm piobuf, so it's available again */
+	ipath_disarm_piobufs(dd, pbnum, 1);
+}
+
 static int __devinit ipath_init_one(struct pci_dev *pdev,
 				    const struct pci_device_id *ent)
 {
@@ -298,8 +382,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 
 	ipath_cdbg(VERBOSE, "initializing unit #%u\n", dd->ipath_unit);
 
-	read_bars(dd, pdev, &bar0, &bar1);
-
 	ret = pci_enable_device(pdev);
 	if (ret) {
 		/* This can happen iff:
@@ -445,9 +527,6 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 		goto bail_regions;
 	}
 
-	dd->ipath_deviceid = ent->device;	/* save for later use */
-	dd->ipath_vendorid = ent->vendor;
-
 	dd->ipath_pcirev = pdev->revision;
 
 #if defined(__powerpc__)
@@ -515,6 +594,8 @@ static int __devinit ipath_init_one(struct pci_dev *pdev,
 		ret = 0;
 	}
 
+	ipath_verify_pioperf(dd);
+
 	ipath_device_create_group(&pdev->dev, dd);
 	ipathfs_add_device(dd);
 	ipath_user_add(dd);
@@ -2005,6 +2086,8 @@ void ipath_shutdown_device(struct ipath_devdata *dd)
 			    INFINIPATH_IBCC_LINKINITCMD_SHIFT);
 	ipath_cancel_sends(dd, 0);
 
+	signal_ib_event(dd, IB_EVENT_PORT_ERR);
+
 	/* disable IBC */
 	dd->ipath_control &= ~INFINIPATH_C_LINKENABLE;
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_control,
diff --git a/drivers/infiniband/hw/ipath/ipath_eeprom.c b/drivers/infiniband/hw/ipath/ipath_eeprom.c
index b4503e9c1e95..bcfa3ccb555f 100644
--- a/drivers/infiniband/hw/ipath/ipath_eeprom.c
+++ b/drivers/infiniband/hw/ipath/ipath_eeprom.c
@@ -596,7 +596,11 @@ void ipath_get_eeprom_info(struct ipath_devdata *dd)
 		goto bail;
 	}
 
-	len = offsetof(struct ipath_flash, if_future);
+	/*
+	 * Read full flash, not just currently used part, since it may have
+	 * been written with a newer definition.
+	 */
+	len = sizeof(struct ipath_flash);
 	buf = vmalloc(len);
 	if (!buf) {
 		ipath_dev_err(dd, "Couldn't allocate memory to read %u "
@@ -737,8 +741,10 @@ int ipath_update_eeprom_log(struct ipath_devdata *dd)
 	/*
 	 * The quick-check above determined that there is something worthy
 	 * of logging, so get current contents and do a more detailed idea.
+	 * Read full flash, not just currently used part, since it may have
+	 * been written with a newer definition.
 	 */
-	len = offsetof(struct ipath_flash, if_future);
+	len = sizeof(struct ipath_flash);
 	buf = vmalloc(len);
 	ret = 1;
 	if (!buf) {
diff --git a/drivers/infiniband/hw/ipath/ipath_file_ops.c b/drivers/infiniband/hw/ipath/ipath_file_ops.c
index 33ab0d6b80ff..5de3243a47c3 100644
--- a/drivers/infiniband/hw/ipath/ipath_file_ops.c
+++ b/drivers/infiniband/hw/ipath/ipath_file_ops.c
@@ -538,6 +538,9 @@ static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
 			continue;
 		cnt++;
 		if (dd->ipath_pageshadow[porttid + tid]) {
+			struct page *p;
+			p = dd->ipath_pageshadow[porttid + tid];
+			dd->ipath_pageshadow[porttid + tid] = NULL;
 			ipath_cdbg(VERBOSE, "PID %u freeing TID %u\n",
 				   pd->port_pid, tid);
 			dd->ipath_f_put_tid(dd, &tidbase[tid],
@@ -546,9 +549,7 @@ static int ipath_tid_free(struct ipath_portdata *pd, unsigned subport,
 			pci_unmap_page(dd->pcidev,
 				dd->ipath_physshadow[porttid + tid],
 				PAGE_SIZE, PCI_DMA_FROMDEVICE);
-			ipath_release_user_pages(
-				&dd->ipath_pageshadow[porttid + tid], 1);
-			dd->ipath_pageshadow[porttid + tid] = NULL;
+			ipath_release_user_pages(&p, 1);
 			ipath_stats.sps_pageunlocks++;
 		} else
 			ipath_dbg("Unused tid %u, ignoring\n", tid);
@@ -1341,6 +1342,19 @@ bail:
 	return ret;
 }
 
+/* Flag readable once per port_hdrqfull change when overflow polling is on */
+static unsigned ipath_poll_hdrqfull(struct ipath_portdata *pd)
+{
+	if (!(pd->poll_type & IPATH_POLL_TYPE_OVERFLOW))
+		return 0;
+	if (pd->port_hdrqfull == pd->port_hdrqfull_poll)
+		return 0;
+
+	/* remember the count that is being reported now */
+	pd->port_hdrqfull_poll = pd->port_hdrqfull;
+	return POLLIN | POLLRDNORM;
+}
+
 static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
 				      struct file *fp,
 				      struct poll_table_struct *pt)
@@ -1350,22 +1364,20 @@ static unsigned int ipath_poll_urgent(struct ipath_portdata *pd,
 
 	dd = pd->port_dd;
 
-	if (test_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag)) {
-		pollflag |= POLLERR;
-		clear_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag);
-	}
+	/* variable access in ipath_poll_hdrqfull() needs this */
+	rmb();
+	pollflag = ipath_poll_hdrqfull(pd);
 
-	if (test_bit(IPATH_PORT_WAITING_URG, &pd->int_flag)) {
+	if (pd->port_urgent != pd->port_urgent_poll) {
 		pollflag |= POLLIN | POLLRDNORM;
-		clear_bit(IPATH_PORT_WAITING_URG, &pd->int_flag);
+		pd->port_urgent_poll = pd->port_urgent;
 	}
 
 	if (!pollflag) {
+		/* this saves a spin_lock/unlock in interrupt handler... */
 		set_bit(IPATH_PORT_WAITING_URG, &pd->port_flag);
-		if (pd->poll_type & IPATH_POLL_TYPE_OVERFLOW)
-			set_bit(IPATH_PORT_WAITING_OVERFLOW,
-				&pd->port_flag);
-
+		/* flush waiting flag so we don't miss an event... */
+		wmb();
 		poll_wait(fp, &pd->port_wait, pt);
 	}
 
@@ -1376,31 +1388,27 @@ static unsigned int ipath_poll_next(struct ipath_portdata *pd,
 				    struct file *fp,
 				    struct poll_table_struct *pt)
 {
-	u32 head, tail;
+	u32 head;
+	u32 tail;
 	unsigned pollflag = 0;
 	struct ipath_devdata *dd;
 
 	dd = pd->port_dd;
 
+	/* variable access in ipath_poll_hdrqfull() needs this */
+	rmb();
+	pollflag = ipath_poll_hdrqfull(pd);
+
 	head = ipath_read_ureg32(dd, ur_rcvhdrhead, pd->port_port);
 	tail = *(volatile u64 *)pd->port_rcvhdrtail_kvaddr;
 
-	if (test_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag)) {
-		pollflag |= POLLERR;
-		clear_bit(IPATH_PORT_WAITING_OVERFLOW, &pd->int_flag);
-	}
-
-	if (tail != head ||
-	    test_bit(IPATH_PORT_WAITING_RCV, &pd->int_flag)) {
+	if (head != tail)
 		pollflag |= POLLIN | POLLRDNORM;
-		clear_bit(IPATH_PORT_WAITING_RCV, &pd->int_flag);
-	}
-
-	if (!pollflag) {
+	else {
+		/* this saves a spin_lock/unlock in interrupt handler */
 		set_bit(IPATH_PORT_WAITING_RCV, &pd->port_flag);
-		if (pd->poll_type & IPATH_POLL_TYPE_OVERFLOW)
-			set_bit(IPATH_PORT_WAITING_OVERFLOW,
-				&pd->port_flag);
+		/* flush waiting flag so we don't miss an event */
+		wmb();
 
 		set_bit(pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT,
 			&dd->ipath_rcvctrl);
@@ -1917,6 +1925,12 @@ static int ipath_do_user_init(struct file *fp,
 	ipath_cdbg(VERBOSE, "Wrote port%d egrhead %x from tail regs\n",
 		pd->port_port, head32);
 	pd->port_tidcursor = 0;	/* start at beginning after open */
+
+	/* initialize poll variables... */
+	pd->port_urgent = 0;
+	pd->port_urgent_poll = 0;
+	pd->port_hdrqfull_poll = pd->port_hdrqfull;
+
 	/*
 	 * now enable the port; the tail registers will be written to memory
 	 * by the chip as soon as it sees the write to
@@ -2039,9 +2053,11 @@ static int ipath_close(struct inode *in, struct file *fp)
 
 	if (dd->ipath_kregbase) {
 		int i;
-		/* atomically clear receive enable port. */
+		/* atomically clear receive enable port and intr avail. */
 		clear_bit(INFINIPATH_R_PORTENABLE_SHIFT + port,
 			  &dd->ipath_rcvctrl);
+		clear_bit(pd->port_port + INFINIPATH_R_INTRAVAIL_SHIFT,
+			  &dd->ipath_rcvctrl);
 		ipath_write_kreg( dd, dd->ipath_kregs->kr_rcvctrl,
 			dd->ipath_rcvctrl);
 		/* and read back from chip to be sure that nothing
diff --git a/drivers/infiniband/hw/ipath/ipath_fs.c b/drivers/infiniband/hw/ipath/ipath_fs.c
index 2e689b974e1f..262c25db05cd 100644
--- a/drivers/infiniband/hw/ipath/ipath_fs.c
+++ b/drivers/infiniband/hw/ipath/ipath_fs.c
@@ -130,175 +130,6 @@ static const struct file_operations atomic_counters_ops = {
 	.read = atomic_counters_read,
 };
 
-static ssize_t atomic_node_info_read(struct file *file, char __user *buf,
-				     size_t count, loff_t *ppos)
-{
-	u32 nodeinfo[10];
-	struct ipath_devdata *dd;
-	u64 guid;
-
-	dd = file->f_path.dentry->d_inode->i_private;
-
-	guid = be64_to_cpu(dd->ipath_guid);
-
-	nodeinfo[0] =			/* BaseVersion is SMA */
-		/* ClassVersion is SMA */
-		(1 << 8)		/* NodeType  */
-		| (1 << 0);		/* NumPorts */
-	nodeinfo[1] = (u32) (guid >> 32);
-	nodeinfo[2] = (u32) (guid & 0xffffffff);
-	/* PortGUID == SystemImageGUID for us */
-	nodeinfo[3] = nodeinfo[1];
-	/* PortGUID == SystemImageGUID for us */
-	nodeinfo[4] = nodeinfo[2];
-	/* PortGUID == NodeGUID for us */
-	nodeinfo[5] = nodeinfo[3];
-	/* PortGUID == NodeGUID for us */
-	nodeinfo[6] = nodeinfo[4];
-	nodeinfo[7] = (4 << 16) /* we support 4 pkeys */
-		| (dd->ipath_deviceid << 0);
-	/* our chip version as 16 bits major, 16 bits minor */
-	nodeinfo[8] = dd->ipath_minrev | (dd->ipath_majrev << 16);
-	nodeinfo[9] = (dd->ipath_unit << 24) | (dd->ipath_vendorid << 0);
-
-	return simple_read_from_buffer(buf, count, ppos, nodeinfo,
-				       sizeof nodeinfo);
-}
-
-static const struct file_operations atomic_node_info_ops = {
-	.read = atomic_node_info_read,
-};
-
-static ssize_t atomic_port_info_read(struct file *file, char __user *buf,
-				     size_t count, loff_t *ppos)
-{
-	u32 portinfo[13];
-	u32 tmp, tmp2;
-	struct ipath_devdata *dd;
-
-	dd = file->f_path.dentry->d_inode->i_private;
-
-	/* so we only initialize non-zero fields. */
-	memset(portinfo, 0, sizeof portinfo);
-
-	/*
-	 * Notimpl yet M_Key (64)
-	 * Notimpl yet GID (64)
-	 */
-
-	portinfo[4] = (dd->ipath_lid << 16);
-
-	/*
-	 * Notimpl yet SMLID.
-	 * CapabilityMask is 0, we don't support any of these
-	 * DiagCode is 0; we don't store any diag info for now Notimpl yet
-	 * M_KeyLeasePeriod (we don't support M_Key)
-	 */
-
-	/* LocalPortNum is whichever port number they ask for */
-	portinfo[7] = (dd->ipath_unit << 24)
-		/* LinkWidthEnabled */
-		| (2 << 16)
-		/* LinkWidthSupported (really 2, but not IB valid) */
-		| (3 << 8)
-		/* LinkWidthActive */
-		| (2 << 0);
-	tmp = dd->ipath_lastibcstat & IPATH_IBSTATE_MASK;
-	tmp2 = 5;
-	if (tmp == IPATH_IBSTATE_INIT)
-		tmp = 2;
-	else if (tmp == IPATH_IBSTATE_ARM)
-		tmp = 3;
-	else if (tmp == IPATH_IBSTATE_ACTIVE)
-		tmp = 4;
-	else {
-		tmp = 0;	/* down */
-		tmp2 = tmp & 0xf;
-	}
-
-	portinfo[8] = (1 << 28)	/* LinkSpeedSupported */
-		| (tmp << 24)	/* PortState */
-		| (tmp2 << 20)	/* PortPhysicalState */
-		| (2 << 16)
-
-		/* LinkDownDefaultState */
-		/* M_KeyProtectBits == 0 */
-		/* NotImpl yet LMC == 0 (we can support all values) */
-		| (1 << 4)	/* LinkSpeedActive */
-		| (1 << 0);	/* LinkSpeedEnabled */
-	switch (dd->ipath_ibmtu) {
-	case 4096:
-		tmp = 5;
-		break;
-	case 2048:
-		tmp = 4;
-		break;
-	case 1024:
-		tmp = 3;
-		break;
-	case 512:
-		tmp = 2;
-		break;
-	case 256:
-		tmp = 1;
-		break;
-	default:		/* oops, something is wrong */
-		ipath_dbg("Problem, ipath_ibmtu 0x%x not a valid IB MTU, "
-			  "treat as 2048\n", dd->ipath_ibmtu);
-		tmp = 4;
-		break;
-	}
-	portinfo[9] = (tmp << 28)
-		/* NeighborMTU */
-		/* Notimpl MasterSMSL */
-		| (1 << 20)
-
-		/* VLCap */
-		/* Notimpl InitType (actually, an SMA decision) */
-		/* VLHighLimit is 0 (only one VL) */
-		; /* VLArbitrationHighCap is 0 (only one VL) */
-	/*
-	 * Note: the chips support a maximum MTU of 4096, but the driver
-	 * hasn't implemented this feature yet, so set the maximum
-	 * to 2048.
-	 */
-	portinfo[10] = 	/* VLArbitrationLowCap is 0 (only one VL) */
-		/* InitTypeReply is SMA decision */
-		(4 << 16)	/* MTUCap 2048 */
-		| (7 << 13)	/* VLStallCount */
-		| (0x1f << 8)	/* HOQLife */
-		| (1 << 4)
-
-		/* OperationalVLs 0 */
-		/* PartitionEnforcementInbound */
-		/* PartitionEnforcementOutbound not enforced */
-		/* FilterRawinbound not enforced */
-		;		/* FilterRawOutbound not enforced */
-	/* M_KeyViolations are not counted by hardware, SMA can count */
-	tmp = ipath_read_creg32(dd, dd->ipath_cregs->cr_errpkey);
-	/* P_KeyViolations are counted by hardware. */
-	portinfo[11] = ((tmp & 0xffff) << 0);
-	portinfo[12] =
-		/* Q_KeyViolations are not counted by hardware */
-		(1 << 8)
-
-		/* GUIDCap */
-		/* SubnetTimeOut handled by SMA */
-		/* RespTimeValue handled by SMA */
-		;
-	/* LocalPhyErrors are programmed to max */
-	portinfo[12] |= (0xf << 20)
-		| (0xf << 16)   /* OverRunErrors are programmed to max */
-		;
-
-	return simple_read_from_buffer(buf, count, ppos, portinfo,
-				       sizeof portinfo);
-}
-
-static const struct file_operations atomic_port_info_ops = {
-	.read = atomic_port_info_read,
-};
-
 static ssize_t flash_read(struct file *file, char __user *buf,
 			  size_t count, loff_t *ppos)
 {
@@ -427,22 +258,6 @@ static int create_device_files(struct super_block *sb,
 		goto bail;
 	}
 
-	ret = create_file("node_info", S_IFREG|S_IRUGO, dir, &tmp,
-			  &atomic_node_info_ops, dd);
-	if (ret) {
-		printk(KERN_ERR "create_file(%s/node_info) "
-		       "failed: %d\n", unit, ret);
-		goto bail;
-	}
-
-	ret = create_file("port_info", S_IFREG|S_IRUGO, dir, &tmp,
-			  &atomic_port_info_ops, dd);
-	if (ret) {
-		printk(KERN_ERR "create_file(%s/port_info) "
-		       "failed: %d\n", unit, ret);
-		goto bail;
-	}
-
 	ret = create_file("flash", S_IFREG|S_IWUSR|S_IRUGO, dir, &tmp,
 			  &flash_ops, dd);
 	if (ret) {
@@ -508,8 +323,6 @@ static int remove_device_files(struct super_block *sb,
 	}
 
 	remove_file(dir, "flash");
-	remove_file(dir, "port_info");
-	remove_file(dir, "node_info");
 	remove_file(dir, "atomic_counters");
 	d_delete(dir);
 	ret = simple_rmdir(root->d_inode, dir);
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6110.c b/drivers/infiniband/hw/ipath/ipath_iba6110.c
index 650745d83fac..ddbebe4bdb27 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6110.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba6110.c
@@ -631,56 +631,35 @@ static int ipath_ht_boardname(struct ipath_devdata *dd, char *name,
 {
 	char *n = NULL;
 	u8 boardrev = dd->ipath_boardrev;
-	int ret;
+	int ret = 0;
 
 	switch (boardrev) {
-	case 4:		/* Ponderosa is one of the bringup boards */
-		n = "Ponderosa";
-		break;
 	case 5:
 		/*
 		 * original production board; two production levels, with
 		 * different serial number ranges.   See ipath_ht_early_init() for
 		 * case where we enable IPATH_GPIO_INTR for later serial # range.
+		 * Original 112* serial number is no longer supported.
 		 */
 		n = "InfiniPath_QHT7040";
 		break;
-	case 6:
-		n = "OEM_Board_3";
-		break;
 	case 7:
 		/* small form factor production board */
 		n = "InfiniPath_QHT7140";
 		break;
-	case 8:
-		n = "LS/X-1";
-		break;
-	case 9:		/* Comstock bringup test board */
-		n = "Comstock";
-		break;
-	case 10:
-		n = "OEM_Board_2";
-		break;
-	case 11:
-		n = "InfiniPath_HT-470"; /* obsoleted */
-		break;
-	case 12:
-		n = "OEM_Board_4";
-		break;
 	default:		/* don't know, just print the number */
 		ipath_dev_err(dd, "Don't yet know about board "
 			      "with ID %u\n", boardrev);
 		snprintf(name, namelen, "Unknown_InfiniPath_QHT7xxx_%u",
 			 boardrev);
+		ret = 1;
 		break;
 	}
 	if (n)
 		snprintf(name, namelen, "%s", n);
 
-	if (dd->ipath_boardrev != 6 && dd->ipath_boardrev != 7 &&
-	    dd->ipath_boardrev != 11) {
+	if (ret) {
 		ipath_dev_err(dd, "Unsupported InfiniPath board %s!\n", name);
-		ret = 1;
 		goto bail;
 	}
 	if (dd->ipath_majrev != 3 || (dd->ipath_minrev < 2 ||
@@ -1554,10 +1533,25 @@ static int ipath_ht_early_init(struct ipath_devdata *dd)
 		 * can use GPIO interrupts.  They have serial #'s starting
 		 * with 128, rather than 112.
 		 */
-		dd->ipath_flags |= IPATH_GPIO_INTR;
-	} else
-		ipath_dev_err(dd, "Unsupported InfiniPath serial "
-			      "number %.16s!\n", dd->ipath_serial);
+		if (dd->ipath_serial[0] == '1' &&
+		    dd->ipath_serial[1] == '2' &&
+		    dd->ipath_serial[2] == '8')
+			dd->ipath_flags |= IPATH_GPIO_INTR;
+		else {
+			ipath_dev_err(dd, "Unsupported InfiniPath board "
+				"(serial number %.16s)!\n",
+				dd->ipath_serial);
+			return 1;
+		}
+	}
+
+	if (dd->ipath_minrev >= 4) {
+		/* Rev4+ reports extra errors via internal GPIO pins */
+		dd->ipath_flags |= IPATH_GPIO_ERRINTRS;
+		dd->ipath_gpio_mask |= IPATH_GPIO_ERRINTR_MASK;
+		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
+				 dd->ipath_gpio_mask);
+	}
 
 	return 0;
 }
@@ -1592,7 +1586,10 @@ static int ipath_ht_get_base_info(struct ipath_portdata *pd, void *kbase)
 	struct ipath_base_info *kinfo = kbase;
 
 	kinfo->spi_runtime_flags |= IPATH_RUNTIME_HT |
-		IPATH_RUNTIME_RCVHDR_COPY;
+		IPATH_RUNTIME_PIO_REGSWAPPED;
+
+	if (pd->port_dd->ipath_minrev < 4)
+		kinfo->spi_runtime_flags |= IPATH_RUNTIME_RCVHDR_COPY;
 
 	return 0;
 }
diff --git a/drivers/infiniband/hw/ipath/ipath_iba6120.c b/drivers/infiniband/hw/ipath/ipath_iba6120.c
index 5b6ac9a1a709..0103d6f4847b 100644
--- a/drivers/infiniband/hw/ipath/ipath_iba6120.c
+++ b/drivers/infiniband/hw/ipath/ipath_iba6120.c
@@ -1143,11 +1143,14 @@ static void ipath_pe_put_tid(struct ipath_devdata *dd, u64 __iomem *tidptr,
 			pa |= 2 << 29;
 	}
 
-	/* workaround chip bug 9437 by writing each TID twice
-	 * and holding a spinlock around the writes, so they don't
-	 * intermix with other TID (eager or expected) writes
-	 * Unfortunately, this call can be done from interrupt level
-	 * for the port 0 eager TIDs, so we have to use irqsave
+	/*
+	 * Workaround chip bug 9437 by writing the scratch register
+	 * before and after the TID, and with an io write barrier.
+	 * We use a spinlock around the writes, so they can't intermix
+	 * with other TID (eager or expected) writes (the chip bug
+	 * is triggered by back to back TID writes). Unfortunately, this
+	 * call can be done from interrupt level for the port 0 eager TIDs,
+	 * so we have to use irqsave locks.
 	 */
 	spin_lock_irqsave(&dd->ipath_tid_lock, flags);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_scratch, 0xfeeddeaf);
@@ -1273,6 +1276,8 @@ static void ipath_pe_tidtemplate(struct ipath_devdata *dd)
 static int ipath_pe_early_init(struct ipath_devdata *dd)
 {
 	dd->ipath_flags |= IPATH_4BYTE_TID;
+	if (ipath_unordered_wc())
+		dd->ipath_flags |= IPATH_PIO_FLUSH_WC;
 
 	/*
 	 * For openfabrics, we need to be able to handle an IB header of
@@ -1343,7 +1348,8 @@ static int ipath_pe_get_base_info(struct ipath_portdata *pd, void *kbase)
 	dd = pd->port_dd;
 
 done:
-	kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE;
+	kinfo->spi_runtime_flags |= IPATH_RUNTIME_PCIE |
+		IPATH_RUNTIME_FORCE_PIOAVAIL | IPATH_RUNTIME_PIO_REGSWAPPED;
 	return 0;
 }
 
diff --git a/drivers/infiniband/hw/ipath/ipath_intr.c b/drivers/infiniband/hw/ipath/ipath_intr.c
index b29fe7e9b11a..6a5dd5cd773d 100644
--- a/drivers/infiniband/hw/ipath/ipath_intr.c
+++ b/drivers/infiniband/hw/ipath/ipath_intr.c
@@ -275,6 +275,16 @@ static char *ib_linkstate(u32 linkstate)
 	return ret;
 }
 
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev)
+{
+	struct ib_event event;
+
+	event.device = &dd->verbs_dev->ibdev;
+	event.element.port_num = 1;
+	event.event = ev;
+	ib_dispatch_event(&event);
+}
+
 static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
 				     ipath_err_t errs, int noprint)
 {
@@ -373,6 +383,8 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
 	dd->ipath_ibpollcnt = 0;	/* some state other than 2 or 3 */
 	ipath_stats.sps_iblink++;
 	if (ltstate != INFINIPATH_IBCS_LT_STATE_LINKUP) {
+		if (dd->ipath_flags & IPATH_LINKACTIVE)
+			signal_ib_event(dd, IB_EVENT_PORT_ERR);
 		dd->ipath_flags |= IPATH_LINKDOWN;
 		dd->ipath_flags &= ~(IPATH_LINKUNK | IPATH_LINKINIT
 				     | IPATH_LINKACTIVE |
@@ -405,7 +417,10 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
 		*dd->ipath_statusp |=
 			IPATH_STATUS_IB_READY | IPATH_STATUS_IB_CONF;
 		dd->ipath_f_setextled(dd, lstate, ltstate);
+		signal_ib_event(dd, IB_EVENT_PORT_ACTIVE);
 	} else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_INIT) {
+		if (dd->ipath_flags & IPATH_LINKACTIVE)
+			signal_ib_event(dd, IB_EVENT_PORT_ERR);
 		/*
 		 * set INIT and DOWN.  Down is checked by most of the other
 		 * code, but INIT is useful to know in a few places.
@@ -418,6 +433,8 @@ static void handle_e_ibstatuschanged(struct ipath_devdata *dd,
 					| IPATH_STATUS_IB_READY);
 		dd->ipath_f_setextled(dd, lstate, ltstate);
 	} else if ((val & IPATH_IBSTATE_MASK) == IPATH_IBSTATE_ARM) {
+		if (dd->ipath_flags & IPATH_LINKACTIVE)
+			signal_ib_event(dd, IB_EVENT_PORT_ERR);
 		dd->ipath_flags |= IPATH_LINKARMED;
 		dd->ipath_flags &=
 			~(IPATH_LINKUNK | IPATH_LINKDOWN | IPATH_LINKINIT |
@@ -688,17 +705,9 @@ static int handle_errors(struct ipath_devdata *dd, ipath_err_t errs)
 					chkerrpkts = 1;
 				dd->ipath_lastrcvhdrqtails[i] = tl;
 				pd->port_hdrqfull++;
-				if (test_bit(IPATH_PORT_WAITING_OVERFLOW,
-					     &pd->port_flag)) {
-					clear_bit(
-					  IPATH_PORT_WAITING_OVERFLOW,
-					  &pd->port_flag);
-					set_bit(
-					  IPATH_PORT_WAITING_OVERFLOW,
-					  &pd->int_flag);
-					wake_up_interruptible(
-					  &pd->port_wait);
-				}
+				/* flush hdrqfull so that poll() sees it */
+				wmb();
+				wake_up_interruptible(&pd->port_wait);
 			}
 		}
 	}
@@ -960,6 +969,8 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat)
 	int i;
 	int rcvdint = 0;
 
+	/* test_bit below needs this... */
+	rmb();
 	portr = ((istat >> INFINIPATH_I_RCVAVAIL_SHIFT) &
 		 dd->ipath_i_rcvavail_mask)
 		| ((istat >> INFINIPATH_I_RCVURG_SHIFT) &
@@ -967,22 +978,15 @@ static void handle_urcv(struct ipath_devdata *dd, u32 istat)
 	for (i = 1; i < dd->ipath_cfgports; i++) {
 		struct ipath_portdata *pd = dd->ipath_pd[i];
 		if (portr & (1 << i) && pd && pd->port_cnt) {
-			if (test_bit(IPATH_PORT_WAITING_RCV,
-				     &pd->port_flag)) {
-				clear_bit(IPATH_PORT_WAITING_RCV,
-					  &pd->port_flag);
-				set_bit(IPATH_PORT_WAITING_RCV,
-					&pd->int_flag);
+			if (test_and_clear_bit(IPATH_PORT_WAITING_RCV,
+					       &pd->port_flag)) {
 				clear_bit(i + INFINIPATH_R_INTRAVAIL_SHIFT,
 					  &dd->ipath_rcvctrl);
 				wake_up_interruptible(&pd->port_wait);
 				rcvdint = 1;
-			} else if (test_bit(IPATH_PORT_WAITING_URG,
-					    &pd->port_flag)) {
-				clear_bit(IPATH_PORT_WAITING_URG,
-					  &pd->port_flag);
-				set_bit(IPATH_PORT_WAITING_URG,
-					&pd->int_flag);
+			} else if (test_and_clear_bit(IPATH_PORT_WAITING_URG,
+						      &pd->port_flag)) {
+				pd->port_urgent++;
 				wake_up_interruptible(&pd->port_wait);
 			}
 		}
@@ -1085,8 +1089,8 @@ irqreturn_t ipath_intr(int irq, void *data)
 		 * GPIO_2 indicates (on some HT4xx boards) that a packet
 		 *        has arrived for Port 0. Checking for this
 		 *        is controlled by flag IPATH_GPIO_INTR.
-		 * GPIO_3..5 on IBA6120 Rev2 chips indicate errors
-		 *        that we need to count. Checking for this
+		 * GPIO_3..5 on IBA6120 Rev2 and IBA6110 Rev4 chips indicate
+		 *        errors that we need to count. Checking for this
 		 *        is controlled by flag IPATH_GPIO_ERRINTRS.
 		 */
 		u32 gpiostatus;
@@ -1137,10 +1141,8 @@ irqreturn_t ipath_intr(int irq, void *data)
 			/*
 			 * Some unexpected bits remain. If they could have
 			 * caused the interrupt, complain and clear.
-			 * MEA: this is almost certainly non-ideal.
-			 * we should look into auto-disable of unexpected
-			 * GPIO interrupts, possibly on a "three strikes"
-			 * basis.
+			 * To avoid repetition of this condition, also clear
+			 * the mask. It is almost certainly due to error.
 			 */
 			const u32 mask = (u32) dd->ipath_gpio_mask;
 
@@ -1148,6 +1150,10 @@ irqreturn_t ipath_intr(int irq, void *data)
 				ipath_dbg("Unexpected GPIO IRQ bits %x\n",
 				  gpiostatus & mask);
 				to_clear |= (gpiostatus & mask);
+				dd->ipath_gpio_mask &= ~(gpiostatus & mask);
+				ipath_write_kreg(dd,
+					dd->ipath_kregs->kr_gpio_mask,
+					dd->ipath_gpio_mask);
 			}
 		}
 		if (to_clear) {
diff --git a/drivers/infiniband/hw/ipath/ipath_kernel.h b/drivers/infiniband/hw/ipath/ipath_kernel.h
index 7a7966f7e4ff..8786dd7922e4 100644
--- a/drivers/infiniband/hw/ipath/ipath_kernel.h
+++ b/drivers/infiniband/hw/ipath/ipath_kernel.h
@@ -42,6 +42,7 @@
 #include <linux/pci.h>
 #include <linux/dma-mapping.h>
 #include <asm/io.h>
+#include <rdma/ib_verbs.h>
 
 #include "ipath_common.h"
 #include "ipath_debug.h"
@@ -139,6 +140,12 @@ struct ipath_portdata {
 	u32 port_pionowait;
 	/* total number of rcvhdrqfull errors */
 	u32 port_hdrqfull;
+	/* saved total number of rcvhdrqfull errors for poll edge trigger */
+	u32 port_hdrqfull_poll;
+	/* total number of polled urgent packets */
+	u32 port_urgent;
+	/* saved total number of polled urgent packets for poll edge trigger */
+	u32 port_urgent_poll;
 	/* pid of process using this port */
 	pid_t port_pid;
 	/* same size as task_struct .comm[] */
@@ -724,6 +731,8 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
 #define IPATH_LINKACTIVE    0x200
 		/* link current state is unknown */
 #define IPATH_LINKUNK       0x400
+		/* Write combining flush needed for PIO */
+#define IPATH_PIO_FLUSH_WC  0x1000
 		/* no IB cable, or no device on IB cable */
 #define IPATH_NOCABLE       0x4000
 		/* Supports port zero per packet receive interrupts via
@@ -755,8 +764,6 @@ int ipath_set_rx_pol_inv(struct ipath_devdata *dd, u8 new_pol_inv);
 #define IPATH_PORT_MASTER_UNINIT 4
 		/* waiting for an urgent packet to arrive */
 #define IPATH_PORT_WAITING_URG 5
-		/* waiting for a header overflow */
-#define IPATH_PORT_WAITING_OVERFLOW 6
 
 /* free up any allocated data at closes */
 void ipath_free_data(struct ipath_portdata *dd);
@@ -769,6 +776,7 @@ void ipath_get_eeprom_info(struct ipath_devdata *);
 int ipath_update_eeprom_log(struct ipath_devdata *dd);
 void ipath_inc_eeprom_err(struct ipath_devdata *dd, u32 eidx, u32 incr);
 u64 ipath_snap_cntr(struct ipath_devdata *, ipath_creg);
+void signal_ib_event(struct ipath_devdata *dd, enum ib_event_type ev);
 
 /*
  * Set LED override, only the two LSBs have "public" meaning, but
diff --git a/drivers/infiniband/hw/ipath/ipath_mad.c b/drivers/infiniband/hw/ipath/ipath_mad.c
index d61c03044545..3d1432d1e3f4 100644
--- a/drivers/infiniband/hw/ipath/ipath_mad.c
+++ b/drivers/infiniband/hw/ipath/ipath_mad.c
@@ -245,7 +245,7 @@ static int recv_subn_get_portinfo(struct ib_smp *smp,
 
 	/* Only return the mkey if the protection field allows it. */
 	if (smp->method == IB_MGMT_METHOD_SET || dev->mkey == smp->mkey ||
-	    (dev->mkeyprot_resv_lmc >> 6) == 0)
+	    dev->mkeyprot == 0)
 		pip->mkey = dev->mkey;
 	pip->gid_prefix = dev->gid_prefix;
 	lid = dev->dd->ipath_lid;
@@ -264,7 +264,7 @@ static int recv_subn_get_portinfo(struct ib_smp *smp,
 	pip->portphysstate_linkdown =
 		(ipath_cvt_physportstate[ibcstat & 0xf] << 4) |
 		(get_linkdowndefaultstate(dev->dd) ? 1 : 2);
-	pip->mkeyprot_resv_lmc = dev->mkeyprot_resv_lmc;
+	pip->mkeyprot_resv_lmc = (dev->mkeyprot << 6) | dev->dd->ipath_lmc;
 	pip->linkspeedactive_enabled = 0x11;	/* 2.5Gbps, 2.5Gbps */
 	switch (dev->dd->ipath_ibmtu) {
 	case 4096:
@@ -401,7 +401,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 	struct ib_port_info *pip = (struct ib_port_info *)smp->data;
 	struct ib_event event;
 	struct ipath_ibdev *dev;
-	u32 flags;
+	struct ipath_devdata *dd;
 	char clientrereg = 0;
 	u16 lid, smlid;
 	u8 lwe;
@@ -415,6 +415,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 		goto err;
 
 	dev = to_idev(ibdev);
+	dd = dev->dd;
 	event.device = ibdev;
 	event.element.port_num = port;
 
@@ -423,11 +424,12 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 	dev->mkey_lease_period = be16_to_cpu(pip->mkey_lease_period);
 
 	lid = be16_to_cpu(pip->lid);
-	if (lid != dev->dd->ipath_lid) {
+	if (dd->ipath_lid != lid ||
+	    dd->ipath_lmc != (pip->mkeyprot_resv_lmc & 7)) {
 		/* Must be a valid unicast LID address. */
 		if (lid == 0 || lid >= IPATH_MULTICAST_LID_BASE)
 			goto err;
-		ipath_set_lid(dev->dd, lid, pip->mkeyprot_resv_lmc & 7);
+		ipath_set_lid(dd, lid, pip->mkeyprot_resv_lmc & 7);
 		event.event = IB_EVENT_LID_CHANGE;
 		ib_dispatch_event(&event);
 	}
@@ -461,18 +463,18 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 	case 0: /* NOP */
 		break;
 	case 1: /* SLEEP */
-		if (set_linkdowndefaultstate(dev->dd, 1))
+		if (set_linkdowndefaultstate(dd, 1))
 			goto err;
 		break;
 	case 2: /* POLL */
-		if (set_linkdowndefaultstate(dev->dd, 0))
+		if (set_linkdowndefaultstate(dd, 0))
 			goto err;
 		break;
 	default:
 		goto err;
 	}
 
-	dev->mkeyprot_resv_lmc = pip->mkeyprot_resv_lmc;
+	dev->mkeyprot = pip->mkeyprot_resv_lmc >> 6;
 	dev->vl_high_limit = pip->vl_high_limit;
 
 	switch ((pip->neighbormtu_mastersmsl >> 4) & 0xF) {
@@ -495,7 +497,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 		/* XXX We have already partially updated our state! */
 		goto err;
 	}
-	ipath_set_mtu(dev->dd, mtu);
+	ipath_set_mtu(dd, mtu);
 
 	dev->sm_sl = pip->neighbormtu_mastersmsl & 0xF;
 
@@ -511,16 +513,16 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 	 * later.
 	 */
 	if (pip->pkey_violations == 0)
-		dev->z_pkey_violations = ipath_get_cr_errpkey(dev->dd);
+		dev->z_pkey_violations = ipath_get_cr_errpkey(dd);
 
 	if (pip->qkey_violations == 0)
 		dev->qkey_violations = 0;
 
 	ore = pip->localphyerrors_overrunerrors;
-	if (set_phyerrthreshold(dev->dd, (ore >> 4) & 0xF))
+	if (set_phyerrthreshold(dd, (ore >> 4) & 0xF))
 		goto err;
 
-	if (set_overrunthreshold(dev->dd, (ore & 0xF)))
+	if (set_overrunthreshold(dd, (ore & 0xF)))
 		goto err;
 
 	dev->subnet_timeout = pip->clientrereg_resv_subnetto & 0x1F;
@@ -538,7 +540,6 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 	 * is down or is being set to down.
 	 */
 	state = pip->linkspeed_portstate & 0xF;
-	flags = dev->dd->ipath_flags;
 	lstate = (pip->portphysstate_linkdown >> 4) & 0xF;
 	if (lstate && !(state == IB_PORT_DOWN || state == IB_PORT_NOP))
 		goto err;
@@ -554,7 +555,7 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 		/* FALLTHROUGH */
 	case IB_PORT_DOWN:
 		if (lstate == 0)
-			if (get_linkdowndefaultstate(dev->dd))
+			if (get_linkdowndefaultstate(dd))
 				lstate = IPATH_IB_LINKDOWN_SLEEP;
 			else
 				lstate = IPATH_IB_LINKDOWN;
@@ -566,27 +567,13 @@ static int recv_subn_set_portinfo(struct ib_smp *smp,
 			lstate = IPATH_IB_LINKDOWN_DISABLE;
 		else
 			goto err;
-		ipath_set_linkstate(dev->dd, lstate);
-		if (flags & IPATH_LINKACTIVE) {
-			event.event = IB_EVENT_PORT_ERR;
-			ib_dispatch_event(&event);
-		}
+		ipath_set_linkstate(dd, lstate);
 		break;
 	case IB_PORT_ARMED:
-		if (!(flags & (IPATH_LINKINIT | IPATH_LINKACTIVE)))
-			break;
-		ipath_set_linkstate(dev->dd, IPATH_IB_LINKARM);
-		if (flags & IPATH_LINKACTIVE) {
-			event.event = IB_EVENT_PORT_ERR;
-			ib_dispatch_event(&event);
-		}
+		ipath_set_linkstate(dd, IPATH_IB_LINKARM);
 		break;
 	case IB_PORT_ACTIVE:
-		if (!(flags & IPATH_LINKARMED))
-			break;
-		ipath_set_linkstate(dev->dd, IPATH_IB_LINKACTIVE);
-		event.event = IB_EVENT_PORT_ACTIVE;
-		ib_dispatch_event(&event);
+		ipath_set_linkstate(dd, IPATH_IB_LINKACTIVE);
 		break;
 	default:
 		/* XXX We have already partially updated our state! */
@@ -1350,7 +1337,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags,
 	if (dev->mkey_lease_timeout && jiffies >= dev->mkey_lease_timeout) {
 		/* Clear timeout and mkey protection field. */
 		dev->mkey_lease_timeout = 0;
-		dev->mkeyprot_resv_lmc &= 0x3F;
+		dev->mkeyprot = 0;
 	}
 
 	/*
@@ -1361,7 +1348,7 @@ static int process_subn(struct ib_device *ibdev, int mad_flags,
 	    dev->mkey != smp->mkey &&
 	    (smp->method == IB_MGMT_METHOD_SET ||
 	     (smp->method == IB_MGMT_METHOD_GET &&
-	      (dev->mkeyprot_resv_lmc >> 7) != 0))) {
+	      dev->mkeyprot >= 2))) {
 		if (dev->mkey_violations != 0xFFFF)
 			++dev->mkey_violations;
 		if (dev->mkey_lease_timeout ||
diff --git a/drivers/infiniband/hw/ipath/ipath_qp.c b/drivers/infiniband/hw/ipath/ipath_qp.c
index 1324b35ff1f8..6a41fdbc8e57 100644
--- a/drivers/infiniband/hw/ipath/ipath_qp.c
+++ b/drivers/infiniband/hw/ipath/ipath_qp.c
@@ -338,6 +338,7 @@ static void ipath_reset_qp(struct ipath_qp *qp)
 	qp->s_busy = 0;
 	qp->s_flags &= IPATH_S_SIGNAL_REQ_WR;
 	qp->s_hdrwords = 0;
+	qp->s_wqe = NULL;
 	qp->s_psn = 0;
 	qp->r_psn = 0;
 	qp->r_msn = 0;
@@ -376,13 +377,15 @@ static void ipath_reset_qp(struct ipath_qp *qp)
  * @err: the receive completion error to signal if a RWQE is active
  *
  * Flushes both send and receive work queues.
+ * Returns true if last WQE event should be generated.
  * The QP s_lock should be held and interrupts disabled.
  */
 
-void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
 {
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
 	struct ib_wc wc;
+	int ret = 0;
 
 	ipath_dbg("QP%d/%d in error state\n",
 		  qp->ibqp.qp_num, qp->remote_qpn);
@@ -453,7 +456,10 @@ void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err)
 		wq->tail = tail;
 
 		spin_unlock(&qp->r_rq.lock);
-	}
+	} else if (qp->ibqp.event_handler)
+		ret = 1;
+
+	return ret;
 }
 
 /**
@@ -472,6 +478,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	struct ipath_qp *qp = to_iqp(ibqp);
 	enum ib_qp_state cur_state, new_state;
 	unsigned long flags;
+	int lastwqe = 0;
 	int ret;
 
 	spin_lock_irqsave(&qp->s_lock, flags);
@@ -531,7 +538,7 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		break;
 
 	case IB_QPS_ERR:
-		ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
+		lastwqe = ipath_error_qp(qp, IB_WC_WR_FLUSH_ERR);
 		break;
 
 	default:
@@ -590,6 +597,14 @@ int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 	qp->state = new_state;
 	spin_unlock_irqrestore(&qp->s_lock, flags);
 
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
 	ret = 0;
 	goto bail;
 
@@ -751,6 +766,9 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 	switch (init_attr->qp_type) {
 	case IB_QPT_UC:
 	case IB_QPT_RC:
+	case IB_QPT_UD:
+	case IB_QPT_SMI:
+	case IB_QPT_GSI:
 		sz = sizeof(struct ipath_sge) *
 			init_attr->cap.max_send_sge +
 			sizeof(struct ipath_swqe);
@@ -759,10 +777,6 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 			ret = ERR_PTR(-ENOMEM);
 			goto bail;
 		}
-		/* FALLTHROUGH */
-	case IB_QPT_UD:
-	case IB_QPT_SMI:
-	case IB_QPT_GSI:
 		sz = sizeof(*qp);
 		if (init_attr->srq) {
 			struct ipath_srq *srq = to_isrq(init_attr->srq);
@@ -805,8 +819,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 		spin_lock_init(&qp->r_rq.lock);
 		atomic_set(&qp->refcount, 0);
 		init_waitqueue_head(&qp->wait);
-		tasklet_init(&qp->s_task, ipath_do_ruc_send,
-			     (unsigned long)qp);
+		tasklet_init(&qp->s_task, ipath_do_send, (unsigned long)qp);
 		INIT_LIST_HEAD(&qp->piowait);
 		INIT_LIST_HEAD(&qp->timerwait);
 		qp->state = IB_QPS_RESET;
diff --git a/drivers/infiniband/hw/ipath/ipath_rc.c b/drivers/infiniband/hw/ipath/ipath_rc.c
index 46744ea2babd..5c29b2bfea17 100644
--- a/drivers/infiniband/hw/ipath/ipath_rc.c
+++ b/drivers/infiniband/hw/ipath/ipath_rc.c
@@ -81,9 +81,8 @@ static void ipath_init_restart(struct ipath_qp *qp, struct ipath_swqe *wqe)
  * Note that we are in the responder's side of the QP context.
  * Note the QP s_lock must be held.
  */
-static int ipath_make_rc_ack(struct ipath_qp *qp,
-			     struct ipath_other_headers *ohdr,
-			     u32 pmtu, u32 *bth0p, u32 *bth2p)
+static int ipath_make_rc_ack(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			     struct ipath_other_headers *ohdr, u32 pmtu)
 {
 	struct ipath_ack_entry *e;
 	u32 hwords;
@@ -192,8 +191,7 @@ static int ipath_make_rc_ack(struct ipath_qp *qp,
 	}
 	qp->s_hdrwords = hwords;
 	qp->s_cur_size = len;
-	*bth0p = bth0 | (1 << 22); /* Set M bit */
-	*bth2p = bth2;
+	ipath_make_ruc_header(dev, qp, ohdr, bth0, bth2);
 	return 1;
 
 bail:
@@ -203,32 +201,39 @@ bail:
 /**
  * ipath_make_rc_req - construct a request packet (SEND, RDMA r/w, ATOMIC)
  * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @pmtu: the path MTU
- * @bth0p: pointer to the BTH opcode word
- * @bth2p: pointer to the BTH PSN word
  *
  * Return 1 if constructed; otherwise, return 0.
- * Note the QP s_lock must be held and interrupts disabled.
  */
-int ipath_make_rc_req(struct ipath_qp *qp,
-		      struct ipath_other_headers *ohdr,
-		      u32 pmtu, u32 *bth0p, u32 *bth2p)
+int ipath_make_rc_req(struct ipath_qp *qp)
 {
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
+	struct ipath_other_headers *ohdr;
 	struct ipath_sge_state *ss;
 	struct ipath_swqe *wqe;
 	u32 hwords;
 	u32 len;
 	u32 bth0;
 	u32 bth2;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
 	char newreq;
+	unsigned long flags;
+	int ret = 0;
+
+	ohdr = &qp->s_hdr.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr.u.l.oth;
+
+	/*
+	 * The lock is needed to synchronize between the sending tasklet,
+	 * the receive interrupt handler, and timeout resends.
+	 */
+	spin_lock_irqsave(&qp->s_lock, flags);
 
 	/* Sending responses has higher priority over sending requests. */
 	if ((qp->r_head_ack_queue != qp->s_tail_ack_queue ||
 	     (qp->s_flags & IPATH_S_ACK_PENDING) ||
 	     qp->s_ack_state != OP(ACKNOWLEDGE)) &&
-	    ipath_make_rc_ack(qp, ohdr, pmtu, bth0p, bth2p))
+	    ipath_make_rc_ack(dev, qp, ohdr, pmtu))
 		goto done;
 
 	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK) ||
@@ -560,13 +565,12 @@ int ipath_make_rc_req(struct ipath_qp *qp,
 	qp->s_hdrwords = hwords;
 	qp->s_cur_sge = ss;
 	qp->s_cur_size = len;
-	*bth0p = bth0 | (qp->s_state << 24);
-	*bth2p = bth2;
+	ipath_make_ruc_header(dev, qp, ohdr, bth0 | (qp->s_state << 24), bth2);
 done:
-	return 1;
-
+	ret = 1;
 bail:
-	return 0;
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
 }
 
 /**
@@ -627,7 +631,7 @@ static void send_rc_ack(struct ipath_qp *qp)
 	/*
 	 * If we can send the ACK, clear the ACK state.
 	 */
-	if (ipath_verbs_send(dev->dd, hwords, (u32 *) &hdr, 0, NULL) == 0) {
+	if (ipath_verbs_send(qp, &hdr, hwords, NULL, 0) == 0) {
 		dev->n_unicast_xmit++;
 		goto done;
 	}
@@ -757,7 +761,9 @@ void ipath_restart_rc(struct ipath_qp *qp, u32 psn, struct ib_wc *wc)
 		wc->vendor_err = 0;
 		wc->byte_len = 0;
 		wc->qp = &qp->ibqp;
+		wc->imm_data = 0;
 		wc->src_qp = qp->remote_qpn;
+		wc->wc_flags = 0;
 		wc->pkey_index = 0;
 		wc->slid = qp->remote_ah_attr.dlid;
 		wc->sl = qp->remote_ah_attr.sl;
@@ -1041,7 +1047,9 @@ static int do_rc_ack(struct ipath_qp *qp, u32 aeth, u32 psn, int opcode,
 			wc.vendor_err = 0;
 			wc.byte_len = 0;
 			wc.qp = &qp->ibqp;
+			wc.imm_data = 0;
 			wc.src_qp = qp->remote_qpn;
+			wc.wc_flags = 0;
 			wc.pkey_index = 0;
 			wc.slid = qp->remote_ah_attr.dlid;
 			wc.sl = qp->remote_ah_attr.sl;
@@ -1454,6 +1462,19 @@ static inline int ipath_rc_rcv_error(struct ipath_ibdev *dev,
 			goto send_ack;
 		}
 		/*
+		 * Try to send a simple ACK to work around a Mellanox bug
+		 * which doesn't accept a RDMA read response or atomic
+		 * response as an ACK for earlier SENDs or RDMA writes.
+		 */
+		if (qp->r_head_ack_queue == qp->s_tail_ack_queue &&
+		    !(qp->s_flags & IPATH_S_ACK_PENDING) &&
+		    qp->s_ack_state == OP(ACKNOWLEDGE)) {
+			spin_unlock_irqrestore(&qp->s_lock, flags);
+			qp->r_nak_state = 0;
+			qp->r_ack_psn = qp->s_ack_queue[i].psn - 1;
+			goto send_ack;
+		}
+		/*
 		 * Resend the RDMA read or atomic op which
 		 * ACKs this duplicate request.
 		 */
@@ -1476,11 +1497,21 @@ send_ack:
 static void ipath_rc_error(struct ipath_qp *qp, enum ib_wc_status err)
 {
 	unsigned long flags;
+	int lastwqe;
 
 	spin_lock_irqsave(&qp->s_lock, flags);
 	qp->state = IB_QPS_ERR;
-	ipath_error_qp(qp, err);
+	lastwqe = ipath_error_qp(qp, err);
 	spin_unlock_irqrestore(&qp->s_lock, flags);
+
+	if (lastwqe) {
+		struct ib_event ev;
+
+		ev.device = qp->ibqp.device;
+		ev.element.qp = &qp->ibqp;
+		ev.event = IB_EVENT_QP_LAST_WQE_REACHED;
+		qp->ibqp.event_handler(&ev, qp->ibqp.qp_context);
+	}
 }
 
 static inline void ipath_update_ack_queue(struct ipath_qp *qp, unsigned n)
diff --git a/drivers/infiniband/hw/ipath/ipath_ruc.c b/drivers/infiniband/hw/ipath/ipath_ruc.c
index c69c25239443..4b6b7ee8e5c1 100644
--- a/drivers/infiniband/hw/ipath/ipath_ruc.c
+++ b/drivers/infiniband/hw/ipath/ipath_ruc.c
@@ -31,6 +31,8 @@
  * SOFTWARE.
  */
 
+#include <linux/spinlock.h>
+
 #include "ipath_verbs.h"
 #include "ipath_kernel.h"
 
@@ -106,27 +108,30 @@ void ipath_insert_rnr_queue(struct ipath_qp *qp)
 	spin_unlock_irqrestore(&dev->pending_lock, flags);
 }
 
-static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe)
+/**
+ * ipath_init_sge - Validate a RWQE and fill in the SGE state
+ * @qp: the QP
+ *
+ * Return 1 if OK.
+ */
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+		   u32 *lengthp, struct ipath_sge_state *ss)
 {
-	int user = to_ipd(qp->ibqp.pd)->user;
 	int i, j, ret;
 	struct ib_wc wc;
 
-	qp->r_len = 0;
+	*lengthp = 0;
 	for (i = j = 0; i < wqe->num_sge; i++) {
 		if (wqe->sg_list[i].length == 0)
 			continue;
 		/* Check LKEY */
-		if ((user && wqe->sg_list[i].lkey == 0) ||
-		    !ipath_lkey_ok(qp, &qp->r_sg_list[j], &wqe->sg_list[i],
-				   IB_ACCESS_LOCAL_WRITE))
+		if (!ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
+				   &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
 			goto bad_lkey;
-		qp->r_len += wqe->sg_list[i].length;
+		*lengthp += wqe->sg_list[i].length;
 		j++;
 	}
-	qp->r_sge.sge = qp->r_sg_list[0];
-	qp->r_sge.sg_list = qp->r_sg_list + 1;
-	qp->r_sge.num_sge = j;
+	ss->num_sge = j;
 	ret = 1;
 	goto bail;
 
@@ -172,6 +177,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
 	u32 tail;
 	int ret;
 
+	qp->r_sge.sg_list = qp->r_sg_list;
+
 	if (qp->ibqp.srq) {
 		srq = to_isrq(qp->ibqp.srq);
 		handler = srq->ibsrq.event_handler;
@@ -199,7 +206,8 @@ int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only)
 		wqe = get_rwqe_ptr(rq, tail);
 		if (++tail >= rq->size)
 			tail = 0;
-	} while (!wr_id_only && !init_sge(qp, wqe));
+	} while (!wr_id_only && !ipath_init_sge(qp, wqe, &qp->r_len,
+						&qp->r_sge));
 	qp->r_wr_id = wqe->wr_id;
 	wq->tail = tail;
 
@@ -239,9 +247,9 @@ bail:
 
 /**
  * ipath_ruc_loopback - handle UC and RC lookback requests
- * @sqp: the loopback QP
+ * @sqp: the sending QP
  *
- * This is called from ipath_do_uc_send() or ipath_do_rc_send() to
+ * This is called from ipath_do_send() to
  * forward a WQE addressed to the same HCA.
  * Note that although we are single threaded due to the tasklet, we still
  * have to protect against post_send().  We don't have to worry about
@@ -450,40 +458,18 @@ again:
 	wc.byte_len = wqe->length;
 	wc.qp = &qp->ibqp;
 	wc.src_qp = qp->remote_qpn;
-	/* XXX do we know which pkey matched? Only needed for GSI. */
 	wc.pkey_index = 0;
 	wc.slid = qp->remote_ah_attr.dlid;
 	wc.sl = qp->remote_ah_attr.sl;
 	wc.dlid_path_bits = 0;
+	wc.port_num = 1;
 	/* Signal completion event if the solicited bit is set. */
 	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
 		       wqe->wr.send_flags & IB_SEND_SOLICITED);
 
 send_comp:
 	sqp->s_rnr_retry = sqp->s_rnr_retry_cnt;
-
-	if (!(sqp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-	    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-		wc.wr_id = wqe->wr.wr_id;
-		wc.status = IB_WC_SUCCESS;
-		wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-		wc.vendor_err = 0;
-		wc.byte_len = wqe->length;
-		wc.qp = &sqp->ibqp;
-		wc.src_qp = 0;
-		wc.pkey_index = 0;
-		wc.slid = 0;
-		wc.sl = 0;
-		wc.dlid_path_bits = 0;
-		wc.port_num = 0;
-		ipath_cq_enter(to_icq(sqp->ibqp.send_cq), &wc, 0);
-	}
-
-	/* Update s_last now that we are finished with the SWQE */
-	spin_lock_irqsave(&sqp->s_lock, flags);
-	if (++sqp->s_last >= sqp->s_size)
-		sqp->s_last = 0;
-	spin_unlock_irqrestore(&sqp->s_lock, flags);
+	ipath_send_complete(sqp, wqe, IB_WC_SUCCESS);
 	goto again;
 
 done:
@@ -491,13 +477,11 @@ done:
 		wake_up(&qp->wait);
 }
 
-static int want_buffer(struct ipath_devdata *dd)
+static void want_buffer(struct ipath_devdata *dd)
 {
 	set_bit(IPATH_S_PIOINTBUFAVAIL, &dd->ipath_sendctrl);
 	ipath_write_kreg(dd, dd->ipath_kregs->kr_sendctrl,
 			 dd->ipath_sendctrl);
-
-	return 0;
 }
 
 /**
@@ -507,14 +491,11 @@ static int want_buffer(struct ipath_devdata *dd)
  *
  * Called when we run out of PIO buffers.
  */
-static void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev)
+static void ipath_no_bufs_available(struct ipath_qp *qp,
+				    struct ipath_ibdev *dev)
 {
 	unsigned long flags;
 
-	spin_lock_irqsave(&dev->pending_lock, flags);
-	if (list_empty(&qp->piowait))
-		list_add_tail(&qp->piowait, &dev->piowait);
-	spin_unlock_irqrestore(&dev->pending_lock, flags);
 	/*
 	 * Note that as soon as want_buffer() is called and
 	 * possibly before it returns, ipath_ib_piobufavail()
@@ -524,101 +505,14 @@ static void ipath_no_bufs_available(struct ipath_qp *qp, struct ipath_ibdev *dev
 	 * We leave the busy flag set so that another post send doesn't
 	 * try to put the same QP on the piowait list again.
 	 */
+	spin_lock_irqsave(&dev->pending_lock, flags);
+	list_add_tail(&qp->piowait, &dev->piowait);
+	spin_unlock_irqrestore(&dev->pending_lock, flags);
 	want_buffer(dev->dd);
 	dev->n_piowait++;
 }
 
 /**
- * ipath_post_ruc_send - post RC and UC sends
- * @qp: the QP to post on
- * @wr: the work request to send
- */
-int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr)
-{
-	struct ipath_swqe *wqe;
-	unsigned long flags;
-	u32 next;
-	int i, j;
-	int acc;
-	int ret;
-
-	/*
-	 * Don't allow RDMA reads or atomic operations on UC or
-	 * undefined operations.
-	 * Make sure buffer is large enough to hold the result for atomics.
-	 */
-	if (qp->ibqp.qp_type == IB_QPT_UC) {
-		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ) {
-			ret = -EINVAL;
-			goto bail;
-		}
-	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD) {
-		ret = -EINVAL;
-		goto bail;
-	} else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
-		   (wr->num_sge == 0 ||
-		    wr->sg_list[0].length < sizeof(u64) ||
-		    wr->sg_list[0].addr & (sizeof(u64) - 1))) {
-		ret = -EINVAL;
-		goto bail;
-	} else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	/* IB spec says that num_sge == 0 is OK. */
-	if (wr->num_sge > qp->s_max_sge) {
-		ret = -ENOMEM;
-		goto bail;
-	}
-	spin_lock_irqsave(&qp->s_lock, flags);
-	next = qp->s_head + 1;
-	if (next >= qp->s_size)
-		next = 0;
-	if (next == qp->s_last) {
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	wqe = get_swqe_ptr(qp, qp->s_head);
-	wqe->wr = *wr;
-	wqe->ssn = qp->s_ssn++;
-	wqe->sg_list[0].mr = NULL;
-	wqe->sg_list[0].vaddr = NULL;
-	wqe->sg_list[0].length = 0;
-	wqe->sg_list[0].sge_length = 0;
-	wqe->length = 0;
-	acc = wr->opcode >= IB_WR_RDMA_READ ? IB_ACCESS_LOCAL_WRITE : 0;
-	for (i = 0, j = 0; i < wr->num_sge; i++) {
-		if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
-			ret = -EINVAL;
-			goto bail;
-		}
-		if (wr->sg_list[i].length == 0)
-			continue;
-		if (!ipath_lkey_ok(qp, &wqe->sg_list[j], &wr->sg_list[i],
-				   acc)) {
-			spin_unlock_irqrestore(&qp->s_lock, flags);
-			ret = -EINVAL;
-			goto bail;
-		}
-		wqe->length += wr->sg_list[i].length;
-		j++;
-	}
-	wqe->wr.num_sge = j;
-	qp->s_head = next;
-	spin_unlock_irqrestore(&qp->s_lock, flags);
-
-	ipath_do_ruc_send((unsigned long) qp);
-
-	ret = 0;
-
-bail:
-	return ret;
-}
-
-/**
  * ipath_make_grh - construct a GRH header
  * @dev: a pointer to the ipath device
  * @hdr: a pointer to the GRH header being constructed
@@ -648,39 +542,66 @@ u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
 	return sizeof(struct ib_grh) / sizeof(u32);
 }
 
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			   struct ipath_other_headers *ohdr,
+			   u32 bth0, u32 bth2)
+{
+	u16 lrh0;
+	u32 nwords;
+	u32 extra_bytes;
+
+	/* Construct the header. */
+	extra_bytes = -qp->s_cur_size & 3;
+	nwords = (qp->s_cur_size + extra_bytes) >> 2;
+	lrh0 = IPATH_LRH_BTH;
+	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
+		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+						 &qp->remote_ah_attr.grh,
+						 qp->s_hdrwords, nwords);
+		lrh0 = IPATH_LRH_GRH;
+	}
+	lrh0 |= qp->remote_ah_attr.sl << 4;
+	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
+	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
+	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords + SIZE_OF_CRC);
+	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
+	bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
+	bth0 |= extra_bytes << 20;
+	ohdr->bth[0] = cpu_to_be32(bth0 | (1 << 22));
+	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
+	ohdr->bth[2] = cpu_to_be32(bth2);
+}
+
 /**
- * ipath_do_ruc_send - perform a send on an RC or UC QP
+ * ipath_do_send - perform a send on a QP
  * @data: contains a pointer to the QP
  *
  * Process entries in the send work queue until credit or queue is
  * exhausted.  Only allow one CPU to send a packet per QP (tasklet).
- * Otherwise, after we drop the QP s_lock, two threads could send
- * packets out of order.
+ * Otherwise, two threads could send packets out of order.
  */
-void ipath_do_ruc_send(unsigned long data)
+void ipath_do_send(unsigned long data)
 {
 	struct ipath_qp *qp = (struct ipath_qp *)data;
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
-	unsigned long flags;
-	u16 lrh0;
-	u32 nwords;
-	u32 extra_bytes;
-	u32 bth0;
-	u32 bth2;
-	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
-	struct ipath_other_headers *ohdr;
+	int (*make_req)(struct ipath_qp *qp);
 
 	if (test_and_set_bit(IPATH_S_BUSY, &qp->s_busy))
 		goto bail;
 
-	if (unlikely(qp->remote_ah_attr.dlid == dev->dd->ipath_lid)) {
+	if ((qp->ibqp.qp_type == IB_QPT_RC ||
+	     qp->ibqp.qp_type == IB_QPT_UC) &&
+	    qp->remote_ah_attr.dlid == dev->dd->ipath_lid) {
 		ipath_ruc_loopback(qp);
 		goto clear;
 	}
 
-	ohdr = &qp->s_hdr.u.oth;
-	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
-		ohdr = &qp->s_hdr.u.l.oth;
+	if (qp->ibqp.qp_type == IB_QPT_RC)
+	       make_req = ipath_make_rc_req;
+	else if (qp->ibqp.qp_type == IB_QPT_UC)
+	       make_req = ipath_make_uc_req;
+	else
+	       make_req = ipath_make_ud_req;
 
 again:
 	/* Check for a constructed packet to be sent. */
@@ -689,9 +610,8 @@ again:
 		 * If no PIO bufs are available, return.  An interrupt will
 		 * call ipath_ib_piobufavail() when one is available.
 		 */
-		if (ipath_verbs_send(dev->dd, qp->s_hdrwords,
-				     (u32 *) &qp->s_hdr, qp->s_cur_size,
-				     qp->s_cur_sge)) {
+		if (ipath_verbs_send(qp, &qp->s_hdr, qp->s_hdrwords,
+				     qp->s_cur_sge, qp->s_cur_size)) {
 			ipath_no_bufs_available(qp, dev);
 			goto bail;
 		}
@@ -700,54 +620,42 @@ again:
 		qp->s_hdrwords = 0;
 	}
 
-	/*
-	 * The lock is needed to synchronize between setting
-	 * qp->s_ack_state, resend timer, and post_send().
-	 */
-	spin_lock_irqsave(&qp->s_lock, flags);
-
-	if (!((qp->ibqp.qp_type == IB_QPT_RC) ?
-	       ipath_make_rc_req(qp, ohdr, pmtu, &bth0, &bth2) :
-	       ipath_make_uc_req(qp, ohdr, pmtu, &bth0, &bth2))) {
-		/*
-		 * Clear the busy bit before unlocking to avoid races with
-		 * adding new work queue items and then failing to process
-		 * them.
-		 */
-		clear_bit(IPATH_S_BUSY, &qp->s_busy);
-		spin_unlock_irqrestore(&qp->s_lock, flags);
-		goto bail;
-	}
+	if (make_req(qp))
+		goto again;
+clear:
+	clear_bit(IPATH_S_BUSY, &qp->s_busy);
+bail:;
+}
 
-	spin_unlock_irqrestore(&qp->s_lock, flags);
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+			 enum ib_wc_status status)
+{
+	u32 last = qp->s_last;
 
-	/* Construct the header. */
-	extra_bytes = (4 - qp->s_cur_size) & 3;
-	nwords = (qp->s_cur_size + extra_bytes) >> 2;
-	lrh0 = IPATH_LRH_BTH;
-	if (unlikely(qp->remote_ah_attr.ah_flags & IB_AH_GRH)) {
-		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
-						 &qp->remote_ah_attr.grh,
-						 qp->s_hdrwords, nwords);
-		lrh0 = IPATH_LRH_GRH;
-	}
-	lrh0 |= qp->remote_ah_attr.sl << 4;
-	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
-	qp->s_hdr.lrh[1] = cpu_to_be16(qp->remote_ah_attr.dlid);
-	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
-				       SIZE_OF_CRC);
-	qp->s_hdr.lrh[3] = cpu_to_be16(dev->dd->ipath_lid);
-	bth0 |= ipath_get_pkey(dev->dd, qp->s_pkey_index);
-	bth0 |= extra_bytes << 20;
-	ohdr->bth[0] = cpu_to_be32(bth0);
-	ohdr->bth[1] = cpu_to_be32(qp->remote_qpn);
-	ohdr->bth[2] = cpu_to_be32(bth2);
+	if (++last == qp->s_size)
+		last = 0;
+	qp->s_last = last;
 
-	/* Check for more work to do. */
-	goto again;
+	/* See ch. 11.2.4.1 and 10.7.3.1 */
+	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
+	    (wqe->wr.send_flags & IB_SEND_SIGNALED) ||
+	    status != IB_WC_SUCCESS) {
+		struct ib_wc wc;
 
-clear:
-	clear_bit(IPATH_S_BUSY, &qp->s_busy);
-bail:
-	return;
+		wc.wr_id = wqe->wr.wr_id;
+		wc.status = status;
+		wc.opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
+		wc.vendor_err = 0;
+		wc.byte_len = wqe->length;
+		wc.imm_data = 0;
+		wc.qp = &qp->ibqp;
+		wc.src_qp = 0;
+		wc.wc_flags = 0;
+		wc.pkey_index = 0;
+		wc.slid = 0;
+		wc.sl = 0;
+		wc.dlid_path_bits = 0;
+		wc.port_num = 0;
+		ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
+	}
 }
diff --git a/drivers/infiniband/hw/ipath/ipath_stats.c b/drivers/infiniband/hw/ipath/ipath_stats.c
index bae4f56f7271..f0271415cd5b 100644
--- a/drivers/infiniband/hw/ipath/ipath_stats.c
+++ b/drivers/infiniband/hw/ipath/ipath_stats.c
@@ -55,7 +55,6 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
 	u64 val64;
 	unsigned long t0, t1;
 	u64 ret;
-	unsigned long flags;
 
 	t0 = jiffies;
 	/* If fast increment counters are only 32 bits, snapshot them,
@@ -92,18 +91,12 @@ u64 ipath_snap_cntr(struct ipath_devdata *dd, ipath_creg creg)
 	if (creg == dd->ipath_cregs->cr_wordsendcnt) {
 		if (val != dd->ipath_lastsword) {
 			dd->ipath_sword += val - dd->ipath_lastsword;
-			spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-			dd->ipath_traffic_wds += val - dd->ipath_lastsword;
-			spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
 			dd->ipath_lastsword = val;
 		}
 		val64 = dd->ipath_sword;
 	} else if (creg == dd->ipath_cregs->cr_wordrcvcnt) {
 		if (val != dd->ipath_lastrword) {
 			dd->ipath_rword += val - dd->ipath_lastrword;
-			spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-			dd->ipath_traffic_wds += val - dd->ipath_lastrword;
-			spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
 			dd->ipath_lastrword = val;
 		}
 		val64 = dd->ipath_rword;
@@ -247,6 +240,7 @@ void ipath_get_faststats(unsigned long opaque)
 	u32 val;
 	static unsigned cnt;
 	unsigned long flags;
+	u64 traffic_wds;
 
 	/*
 	 * don't access the chip while running diags, or memory diags can
@@ -262,12 +256,13 @@ void ipath_get_faststats(unsigned long opaque)
 	 * exceeding a threshold, so we need to check the word-counts
 	 * even if they are 64-bit.
 	 */
-	ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
-	ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
+	traffic_wds = ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt) +
+		ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
 	spin_lock_irqsave(&dd->ipath_eep_st_lock, flags);
-	if (dd->ipath_traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
+	traffic_wds -= dd->ipath_traffic_wds;
+	dd->ipath_traffic_wds += traffic_wds;
+	if (traffic_wds  >= IPATH_TRAFFIC_ACTIVE_THRESHOLD)
 		atomic_add(5, &dd->ipath_active_time); /* S/B #define */
-	dd->ipath_traffic_wds = 0;
 	spin_unlock_irqrestore(&dd->ipath_eep_st_lock, flags);
 
 	if (dd->ipath_flags & IPATH_32BITCOUNTERS) {
diff --git a/drivers/infiniband/hw/ipath/ipath_sysfs.c b/drivers/infiniband/hw/ipath/ipath_sysfs.c
index 16238cd3a036..e1ad7cfc21fd 100644
--- a/drivers/infiniband/hw/ipath/ipath_sysfs.c
+++ b/drivers/infiniband/hw/ipath/ipath_sysfs.c
@@ -163,6 +163,42 @@ static ssize_t show_boardversion(struct device *dev,
 	return scnprintf(buf, PAGE_SIZE, "%s", dd->ipath_boardversion);
 }
 
+static ssize_t show_lmc(struct device *dev,
+			struct device_attribute *attr,
+			char *buf)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+
+	return scnprintf(buf, PAGE_SIZE, "%u\n", dd->ipath_lmc);
+}
+
+static ssize_t store_lmc(struct device *dev,
+			 struct device_attribute *attr,
+			 const char *buf,
+			 size_t count)
+{
+	struct ipath_devdata *dd = dev_get_drvdata(dev);
+	u16 lmc = 0;
+	int ret;
+
+	ret = ipath_parse_ushort(buf, &lmc);
+	if (ret < 0)
+		goto invalid;
+
+	if (lmc > 7) {
+		ret = -EINVAL;
+		goto invalid;
+	}
+
+	ipath_set_lid(dd, dd->ipath_lid, lmc);
+
+	goto bail;
+invalid:
+	ipath_dev_err(dd, "attempt to set invalid LMC %u\n", lmc);
+bail:
+	return ret;
+}
+
 static ssize_t show_lid(struct device *dev,
 			struct device_attribute *attr,
 			char *buf)
@@ -190,7 +226,7 @@ static ssize_t store_lid(struct device *dev,
 		goto invalid;
 	}
 
-	ipath_set_lid(dd, lid, 0);
+	ipath_set_lid(dd, lid, dd->ipath_lmc);
 
 	goto bail;
 invalid:
@@ -648,6 +684,7 @@ static struct attribute_group driver_attr_group = {
 };
 
 static DEVICE_ATTR(guid, S_IWUSR | S_IRUGO, show_guid, store_guid);
+static DEVICE_ATTR(lmc, S_IWUSR | S_IRUGO, show_lmc, store_lmc);
 static DEVICE_ATTR(lid, S_IWUSR | S_IRUGO, show_lid, store_lid);
 static DEVICE_ATTR(link_state, S_IWUSR, NULL, store_link_state);
 static DEVICE_ATTR(mlid, S_IWUSR | S_IRUGO, show_mlid, store_mlid);
@@ -667,6 +704,7 @@ static DEVICE_ATTR(logged_errors, S_IRUGO, show_logged_errs, NULL);
 
 static struct attribute *dev_attributes[] = {
 	&dev_attr_guid.attr,
+	&dev_attr_lmc.attr,
 	&dev_attr_lid.attr,
 	&dev_attr_link_state.attr,
 	&dev_attr_mlid.attr,
diff --git a/drivers/infiniband/hw/ipath/ipath_uc.c b/drivers/infiniband/hw/ipath/ipath_uc.c
index 8380fbc50d2c..2dd8de20d221 100644
--- a/drivers/infiniband/hw/ipath/ipath_uc.c
+++ b/drivers/infiniband/hw/ipath/ipath_uc.c
@@ -37,72 +37,40 @@
 /* cut down ridiculously long IB macro names */
 #define OP(x) IB_OPCODE_UC_##x
 
-static void complete_last_send(struct ipath_qp *qp, struct ipath_swqe *wqe,
-			       struct ib_wc *wc)
-{
-	if (++qp->s_last == qp->s_size)
-		qp->s_last = 0;
-	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-	    (wqe->wr.send_flags & IB_SEND_SIGNALED)) {
-		wc->wr_id = wqe->wr.wr_id;
-		wc->status = IB_WC_SUCCESS;
-		wc->opcode = ib_ipath_wc_opcode[wqe->wr.opcode];
-		wc->vendor_err = 0;
-		wc->byte_len = wqe->length;
-		wc->qp = &qp->ibqp;
-		wc->src_qp = qp->remote_qpn;
-		wc->pkey_index = 0;
-		wc->slid = qp->remote_ah_attr.dlid;
-		wc->sl = qp->remote_ah_attr.sl;
-		wc->dlid_path_bits = 0;
-		wc->port_num = 0;
-		ipath_cq_enter(to_icq(qp->ibqp.send_cq), wc, 0);
-	}
-}
-
 /**
  * ipath_make_uc_req - construct a request packet (SEND, RDMA write)
  * @qp: a pointer to the QP
- * @ohdr: a pointer to the IB header being constructed
- * @pmtu: the path MTU
- * @bth0p: pointer to the BTH opcode word
- * @bth2p: pointer to the BTH PSN word
  *
  * Return 1 if constructed; otherwise, return 0.
- * Note the QP s_lock must be held and interrupts disabled.
  */
-int ipath_make_uc_req(struct ipath_qp *qp,
-		      struct ipath_other_headers *ohdr,
-		      u32 pmtu, u32 *bth0p, u32 *bth2p)
+int ipath_make_uc_req(struct ipath_qp *qp)
 {
+	struct ipath_other_headers *ohdr;
 	struct ipath_swqe *wqe;
 	u32 hwords;
 	u32 bth0;
 	u32 len;
-	struct ib_wc wc;
+	u32 pmtu = ib_mtu_enum_to_int(qp->path_mtu);
+	int ret = 0;
 
 	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK))
 		goto done;
 
+	ohdr = &qp->s_hdr.u.oth;
+	if (qp->remote_ah_attr.ah_flags & IB_AH_GRH)
+		ohdr = &qp->s_hdr.u.l.oth;
+
 	/* header size in 32-bit words LRH+BTH = (8+12)/4. */
 	hwords = 5;
 	bth0 = 1 << 22; /* Set M bit */
 
 	/* Get the next send request. */
-	wqe = get_swqe_ptr(qp, qp->s_last);
+	wqe = get_swqe_ptr(qp, qp->s_cur);
+	qp->s_wqe = NULL;
 	switch (qp->s_state) {
 	default:
-		/*
-		 * Signal the completion of the last send
-		 * (if there is one).
-		 */
-		if (qp->s_last != qp->s_tail) {
-			complete_last_send(qp, wqe, &wc);
-			wqe = get_swqe_ptr(qp, qp->s_last);
-		}
-
 		/* Check if send work queue is empty. */
-		if (qp->s_tail == qp->s_head)
+		if (qp->s_cur == qp->s_head)
 			goto done;
 		/*
 		 * Start a new request.
@@ -131,6 +99,9 @@ int ipath_make_uc_req(struct ipath_qp *qp,
 			}
 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 				bth0 |= 1 << 23;
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
 			break;
 
 		case IB_WR_RDMA_WRITE:
@@ -157,13 +128,14 @@ int ipath_make_uc_req(struct ipath_qp *qp,
 				if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 					bth0 |= 1 << 23;
 			}
+			qp->s_wqe = wqe;
+			if (++qp->s_cur >= qp->s_size)
+				qp->s_cur = 0;
 			break;
 
 		default:
 			goto done;
 		}
-		if (++qp->s_tail >= qp->s_size)
-			qp->s_tail = 0;
 		break;
 
 	case OP(SEND_FIRST):
@@ -185,6 +157,9 @@ int ipath_make_uc_req(struct ipath_qp *qp,
 		}
 		if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 			bth0 |= 1 << 23;
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
 		break;
 
 	case OP(RDMA_WRITE_FIRST):
@@ -207,18 +182,22 @@ int ipath_make_uc_req(struct ipath_qp *qp,
 			if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 				bth0 |= 1 << 23;
 		}
+		qp->s_wqe = wqe;
+		if (++qp->s_cur >= qp->s_size)
+			qp->s_cur = 0;
 		break;
 	}
 	qp->s_len -= len;
 	qp->s_hdrwords = hwords;
 	qp->s_cur_sge = &qp->s_sge;
 	qp->s_cur_size = len;
-	*bth0p = bth0 | (qp->s_state << 24);
-	*bth2p = qp->s_next_psn++ & IPATH_PSN_MASK;
-	return 1;
+	ipath_make_ruc_header(to_idev(qp->ibqp.device),
+			      qp, ohdr, bth0 | (qp->s_state << 24),
+			      qp->s_next_psn++ & IPATH_PSN_MASK);
+	ret = 1;
 
 done:
-	return 0;
+	return ret;
 }
 
 /**
@@ -485,6 +464,16 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 
 	case OP(RDMA_WRITE_LAST_WITH_IMMEDIATE):
 	rdma_last_imm:
+		if (header_in_data) {
+			wc.imm_data = *(__be32 *) data;
+			data += sizeof(__be32);
+		} else {
+			/* Immediate data comes after BTH */
+			wc.imm_data = ohdr->u.imm_data;
+		}
+		hdrsize += 4;
+		wc.wc_flags = IB_WC_WITH_IMM;
+
 		/* Get the number of bytes the message was padded by. */
 		pad = (be32_to_cpu(ohdr->bth[0]) >> 20) & 3;
 		/* Check for invalid length. */
@@ -505,16 +494,7 @@ void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 			dev->n_pkt_drops++;
 			goto done;
 		}
-		if (header_in_data) {
-			wc.imm_data = *(__be32 *) data;
-			data += sizeof(__be32);
-		} else {
-			/* Immediate data comes after BTH */
-			wc.imm_data = ohdr->u.imm_data;
-		}
-		hdrsize += 4;
-		wc.wc_flags = IB_WC_WITH_IMM;
-		wc.byte_len = 0;
+		wc.byte_len = qp->r_len;
 		goto last_imm;
 
 	case OP(RDMA_WRITE_LAST):
diff --git a/drivers/infiniband/hw/ipath/ipath_ud.c b/drivers/infiniband/hw/ipath/ipath_ud.c
index f9a3338a5fb7..16a2a938b520 100644
--- a/drivers/infiniband/hw/ipath/ipath_ud.c
+++ b/drivers/infiniband/hw/ipath/ipath_ud.c
@@ -36,68 +36,17 @@
 #include "ipath_verbs.h"
 #include "ipath_kernel.h"
 
-static int init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
-		    u32 *lengthp, struct ipath_sge_state *ss)
-{
-	int user = to_ipd(qp->ibqp.pd)->user;
-	int i, j, ret;
-	struct ib_wc wc;
-
-	*lengthp = 0;
-	for (i = j = 0; i < wqe->num_sge; i++) {
-		if (wqe->sg_list[i].length == 0)
-			continue;
-		/* Check LKEY */
-		if ((user && wqe->sg_list[i].lkey == 0) ||
-		    !ipath_lkey_ok(qp, j ? &ss->sg_list[j - 1] : &ss->sge,
-				   &wqe->sg_list[i], IB_ACCESS_LOCAL_WRITE))
-			goto bad_lkey;
-		*lengthp += wqe->sg_list[i].length;
-		j++;
-	}
-	ss->num_sge = j;
-	ret = 1;
-	goto bail;
-
-bad_lkey:
-	wc.wr_id = wqe->wr_id;
-	wc.status = IB_WC_LOC_PROT_ERR;
-	wc.opcode = IB_WC_RECV;
-	wc.vendor_err = 0;
-	wc.byte_len = 0;
-	wc.imm_data = 0;
-	wc.qp = &qp->ibqp;
-	wc.src_qp = 0;
-	wc.wc_flags = 0;
-	wc.pkey_index = 0;
-	wc.slid = 0;
-	wc.sl = 0;
-	wc.dlid_path_bits = 0;
-	wc.port_num = 0;
-	/* Signal solicited completion event. */
-	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc, 1);
-	ret = 0;
-bail:
-	return ret;
-}
-
 /**
  * ipath_ud_loopback - handle send on loopback QPs
- * @sqp: the QP
- * @ss: the SGE state
- * @length: the length of the data to send
- * @wr: the work request
- * @wc: the work completion entry
+ * @sqp: the sending QP
+ * @swqe: the send work request
  *
- * This is called from ipath_post_ud_send() to forward a WQE addressed
+ * This is called from ipath_make_ud_req() to forward a WQE addressed
  * to the same HCA.
  * Note that the receive interrupt handler may be calling ipath_ud_rcv()
  * while this is being called.
  */
-static void ipath_ud_loopback(struct ipath_qp *sqp,
-			      struct ipath_sge_state *ss,
-			      u32 length, struct ib_send_wr *wr,
-			      struct ib_wc *wc)
+static void ipath_ud_loopback(struct ipath_qp *sqp, struct ipath_swqe *swqe)
 {
 	struct ipath_ibdev *dev = to_idev(sqp->ibqp.device);
 	struct ipath_qp *qp;
@@ -110,12 +59,18 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
 	struct ipath_rwq *wq;
 	struct ipath_rwqe *wqe;
 	void (*handler)(struct ib_event *, void *);
+	struct ib_wc wc;
 	u32 tail;
 	u32 rlen;
+	u32 length;
 
-	qp = ipath_lookup_qpn(&dev->qp_table, wr->wr.ud.remote_qpn);
-	if (!qp)
-		return;
+	qp = ipath_lookup_qpn(&dev->qp_table, swqe->wr.wr.ud.remote_qpn);
+	if (!qp) {
+		dev->n_pkt_drops++;
+		goto send_comp;
+	}
+
+	rsge.sg_list = NULL;
 
 	/*
 	 * Check that the qkey matches (except for QP0, see 9.6.1.4.1).
@@ -123,39 +78,34 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
 	if (unlikely(qp->ibqp.qp_num &&
-		     ((int) wr->wr.ud.remote_qkey < 0
-		      ? qp->qkey : wr->wr.ud.remote_qkey) != qp->qkey)) {
+		     ((int) swqe->wr.wr.ud.remote_qkey < 0 ?
+		      sqp->qkey : swqe->wr.wr.ud.remote_qkey) != qp->qkey)) {
 		/* XXX OK to lose a count once in a while. */
 		dev->qkey_violations++;
 		dev->n_pkt_drops++;
-		goto done;
+		goto drop;
 	}
 
 	/*
 	 * A GRH is expected to preceed the data even if not
 	 * present on the wire.
 	 */
-	wc->byte_len = length + sizeof(struct ib_grh);
+	length = swqe->length;
+	wc.byte_len = length + sizeof(struct ib_grh);
 
-	if (wr->opcode == IB_WR_SEND_WITH_IMM) {
-		wc->wc_flags = IB_WC_WITH_IMM;
-		wc->imm_data = wr->imm_data;
+	if (swqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		wc.wc_flags = IB_WC_WITH_IMM;
+		wc.imm_data = swqe->wr.imm_data;
 	} else {
-		wc->wc_flags = 0;
-		wc->imm_data = 0;
+		wc.wc_flags = 0;
+		wc.imm_data = 0;
 	}
 
-	if (wr->num_sge > 1) {
-		rsge.sg_list = kmalloc((wr->num_sge - 1) *
-					sizeof(struct ipath_sge),
-				       GFP_ATOMIC);
-	} else
-		rsge.sg_list = NULL;
-
 	/*
-	 * Get the next work request entry to find where to put the data.
-	 * Note that it is safe to drop the lock after changing rq->tail
-	 * since ipath_post_receive() won't fill the empty slot.
+	 * This would be a lot simpler if we could call ipath_get_rwqe()
+	 * but that uses state that the receive interrupt handler uses
+	 * so we would need to lock out receive interrupts while doing
+	 * local loopback.
 	 */
 	if (qp->ibqp.srq) {
 		srq = to_isrq(qp->ibqp.srq);
@@ -167,32 +117,53 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
 		rq = &qp->r_rq;
 	}
 
+	if (rq->max_sge > 1) {
+		/*
+		 * XXX We could use GFP_KERNEL if ipath_do_send()
+		 * was always called from the tasklet instead of
+		 * from ipath_post_send().
+		 */
+		rsge.sg_list = kmalloc((rq->max_sge - 1) *
+					sizeof(struct ipath_sge),
+				       GFP_ATOMIC);
+		if (!rsge.sg_list) {
+			dev->n_pkt_drops++;
+			goto drop;
+		}
+	}
+
+	/*
+	 * Get the next work request entry to find where to put the data.
+	 * Note that it is safe to drop the lock after changing rq->tail
+	 * since ipath_post_receive() won't fill the empty slot.
+	 */
 	spin_lock_irqsave(&rq->lock, flags);
 	wq = rq->wq;
 	tail = wq->tail;
-	while (1) {
-		if (unlikely(tail == wq->head)) {
-			spin_unlock_irqrestore(&rq->lock, flags);
-			dev->n_pkt_drops++;
-			goto bail_sge;
-		}
-		/* Make sure entry is read after head index is read. */
-		smp_rmb();
-		wqe = get_rwqe_ptr(rq, tail);
-		if (++tail >= rq->size)
-			tail = 0;
-		if (init_sge(qp, wqe, &rlen, &rsge))
-			break;
-		wq->tail = tail;
+	/* Validate tail before using it since it is user writable. */
+	if (tail >= rq->size)
+		tail = 0;
+	if (unlikely(tail == wq->head)) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
+	}
+	wqe = get_rwqe_ptr(rq, tail);
+	if (!ipath_init_sge(qp, wqe, &rlen, &rsge)) {
+		spin_unlock_irqrestore(&rq->lock, flags);
+		dev->n_pkt_drops++;
+		goto drop;
 	}
 	/* Silently drop packets which are too big. */
-	if (wc->byte_len > rlen) {
+	if (wc.byte_len > rlen) {
 		spin_unlock_irqrestore(&rq->lock, flags);
 		dev->n_pkt_drops++;
-		goto bail_sge;
+		goto drop;
 	}
+	if (++tail >= rq->size)
+		tail = 0;
 	wq->tail = tail;
-	wc->wr_id = wqe->wr_id;
+	wc.wr_id = wqe->wr_id;
 	if (handler) {
 		u32 n;
 
@@ -221,13 +192,13 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
 	} else
 		spin_unlock_irqrestore(&rq->lock, flags);
 
-	ah_attr = &to_iah(wr->wr.ud.ah)->attr;
+	ah_attr = &to_iah(swqe->wr.wr.ud.ah)->attr;
 	if (ah_attr->ah_flags & IB_AH_GRH) {
 		ipath_copy_sge(&rsge, &ah_attr->grh, sizeof(struct ib_grh));
-		wc->wc_flags |= IB_WC_GRH;
+		wc.wc_flags |= IB_WC_GRH;
 	} else
 		ipath_skip_sge(&rsge, sizeof(struct ib_grh));
-	sge = &ss->sge;
+	sge = swqe->sg_list;
 	while (length) {
 		u32 len = sge->length;
 
@@ -241,8 +212,8 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
 		sge->length -= len;
 		sge->sge_length -= len;
 		if (sge->sge_length == 0) {
-			if (--ss->num_sge)
-				*sge = *ss->sg_list++;
+			if (--swqe->wr.num_sge)
+				sge++;
 		} else if (sge->length == 0 && sge->mr != NULL) {
 			if (++sge->n >= IPATH_SEGSZ) {
 				if (++sge->m >= sge->mr->mapsz)
@@ -256,123 +227,60 @@ static void ipath_ud_loopback(struct ipath_qp *sqp,
 		}
 		length -= len;
 	}
-	wc->status = IB_WC_SUCCESS;
-	wc->opcode = IB_WC_RECV;
-	wc->vendor_err = 0;
-	wc->qp = &qp->ibqp;
-	wc->src_qp = sqp->ibqp.qp_num;
+	wc.status = IB_WC_SUCCESS;
+	wc.opcode = IB_WC_RECV;
+	wc.vendor_err = 0;
+	wc.qp = &qp->ibqp;
+	wc.src_qp = sqp->ibqp.qp_num;
 	/* XXX do we know which pkey matched? Only needed for GSI. */
-	wc->pkey_index = 0;
-	wc->slid = dev->dd->ipath_lid |
+	wc.pkey_index = 0;
+	wc.slid = dev->dd->ipath_lid |
 		(ah_attr->src_path_bits &
-		 ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1));
-	wc->sl = ah_attr->sl;
-	wc->dlid_path_bits =
-		ah_attr->dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+		 ((1 << dev->dd->ipath_lmc) - 1));
+	wc.sl = ah_attr->sl;
+	wc.dlid_path_bits =
+		ah_attr->dlid & ((1 << dev->dd->ipath_lmc) - 1);
+	wc.port_num = 1;
 	/* Signal completion event if the solicited bit is set. */
-	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), wc,
-		       wr->send_flags & IB_SEND_SOLICITED);
-
-bail_sge:
+	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
+		       swqe->wr.send_flags & IB_SEND_SOLICITED);
+drop:
 	kfree(rsge.sg_list);
-done:
 	if (atomic_dec_and_test(&qp->refcount))
 		wake_up(&qp->wait);
+send_comp:
+	ipath_send_complete(sqp, swqe, IB_WC_SUCCESS);
 }
 
 /**
- * ipath_post_ud_send - post a UD send on QP
+ * ipath_make_ud_req - construct a UD request packet
  * @qp: the QP
- * @wr: the work request
  *
- * Note that we actually send the data as it is posted instead of putting
- * the request into a ring buffer.  If we wanted to use a ring buffer,
- * we would need to save a reference to the destination address in the SWQE.
+ * Return 1 if constructed; otherwise, return 0.
  */
-int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+int ipath_make_ud_req(struct ipath_qp *qp)
 {
 	struct ipath_ibdev *dev = to_idev(qp->ibqp.device);
 	struct ipath_other_headers *ohdr;
 	struct ib_ah_attr *ah_attr;
-	struct ipath_sge_state ss;
-	struct ipath_sge *sg_list;
-	struct ib_wc wc;
-	u32 hwords;
+	struct ipath_swqe *wqe;
 	u32 nwords;
-	u32 len;
 	u32 extra_bytes;
 	u32 bth0;
 	u16 lrh0;
 	u16 lid;
-	int i;
-	int ret;
+	int ret = 0;
 
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)) {
-		ret = 0;
+	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_PROCESS_SEND_OK)))
 		goto bail;
-	}
 
-	if (wr->wr.ud.ah->pd != qp->ibqp.pd) {
-		ret = -EPERM;
+	if (qp->s_cur == qp->s_head)
 		goto bail;
-	}
 
-	/* IB spec says that num_sge == 0 is OK. */
-	if (wr->num_sge > qp->s_max_sge) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	if (wr->num_sge > 1) {
-		sg_list = kmalloc((qp->s_max_sge - 1) * sizeof(*sg_list),
-				  GFP_ATOMIC);
-		if (!sg_list) {
-			ret = -ENOMEM;
-			goto bail;
-		}
-	} else
-		sg_list = NULL;
-
-	/* Check the buffer to send. */
-	ss.sg_list = sg_list;
-	ss.sge.mr = NULL;
-	ss.sge.vaddr = NULL;
-	ss.sge.length = 0;
-	ss.sge.sge_length = 0;
-	ss.num_sge = 0;
-	len = 0;
-	for (i = 0; i < wr->num_sge; i++) {
-		/* Check LKEY */
-		if (to_ipd(qp->ibqp.pd)->user && wr->sg_list[i].lkey == 0) {
-			ret = -EINVAL;
-			goto bail;
-		}
-
-		if (wr->sg_list[i].length == 0)
-			continue;
-		if (!ipath_lkey_ok(qp, ss.num_sge ?
-				   sg_list + ss.num_sge - 1 : &ss.sge,
-				   &wr->sg_list[i], 0)) {
-			ret = -EINVAL;
-			goto bail;
-		}
-		len += wr->sg_list[i].length;
-		ss.num_sge++;
-	}
-	/* Check for invalid packet size. */
-	if (len > dev->dd->ipath_ibmtu) {
-		ret = -EINVAL;
-		goto bail;
-	}
-	extra_bytes = (4 - len) & 3;
-	nwords = (len + extra_bytes) >> 2;
+	wqe = get_swqe_ptr(qp, qp->s_cur);
 
 	/* Construct the header. */
-	ah_attr = &to_iah(wr->wr.ud.ah)->attr;
-	if (ah_attr->dlid == 0) {
-		ret = -EINVAL;
-		goto bail;
-	}
+	ah_attr = &to_iah(wqe->wr.wr.ud.ah)->attr;
 	if (ah_attr->dlid >= IPATH_MULTICAST_LID_BASE) {
 		if (ah_attr->dlid != IPATH_PERMISSIVE_LID)
 			dev->n_multicast_xmit++;
@@ -381,74 +289,63 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr)
 	} else {
 		dev->n_unicast_xmit++;
 		lid = ah_attr->dlid &
-			~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+			~((1 << dev->dd->ipath_lmc) - 1);
 		if (unlikely(lid == dev->dd->ipath_lid)) {
-			/*
-			 * Pass in an uninitialized ib_wc to save stack
-			 * space.
-			 */
-			ipath_ud_loopback(qp, &ss, len, wr, &wc);
+			ipath_ud_loopback(qp, wqe);
 			goto done;
 		}
 	}
+
+	extra_bytes = -wqe->length & 3;
+	nwords = (wqe->length + extra_bytes) >> 2;
+
+	/* header size in 32-bit words LRH+BTH+DETH = (8+12+8)/4. */
+	qp->s_hdrwords = 7;
+	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM)
+		qp->s_hdrwords++;
+	qp->s_cur_size = wqe->length;
+	qp->s_cur_sge = &qp->s_sge;
+	qp->s_wqe = wqe;
+	qp->s_sge.sge = wqe->sg_list[0];
+	qp->s_sge.sg_list = wqe->sg_list + 1;
+	qp->s_sge.num_sge = wqe->wr.num_sge;
+
 	if (ah_attr->ah_flags & IB_AH_GRH) {
 		/* Header size in 32-bit words. */
-		hwords = 17;
+		qp->s_hdrwords += ipath_make_grh(dev, &qp->s_hdr.u.l.grh,
+						 &ah_attr->grh,
+						 qp->s_hdrwords, nwords);
 		lrh0 = IPATH_LRH_GRH;
 		ohdr = &qp->s_hdr.u.l.oth;
-		qp->s_hdr.u.l.grh.version_tclass_flow =
-			cpu_to_be32((6 << 28) |
-				    (ah_attr->grh.traffic_class << 20) |
-				    ah_attr->grh.flow_label);
-		qp->s_hdr.u.l.grh.paylen =
-			cpu_to_be16(((wr->opcode ==
-				      IB_WR_SEND_WITH_IMM ? 6 : 5) +
-				     nwords + SIZE_OF_CRC) << 2);
-		/* next_hdr is defined by C8-7 in ch. 8.4.1 */
-		qp->s_hdr.u.l.grh.next_hdr = 0x1B;
-		qp->s_hdr.u.l.grh.hop_limit = ah_attr->grh.hop_limit;
-		/* The SGID is 32-bit aligned. */
-		qp->s_hdr.u.l.grh.sgid.global.subnet_prefix =
-			dev->gid_prefix;
-		qp->s_hdr.u.l.grh.sgid.global.interface_id =
-			dev->dd->ipath_guid;
-		qp->s_hdr.u.l.grh.dgid = ah_attr->grh.dgid;
 		/*
 		 * Don't worry about sending to locally attached multicast
 		 * QPs.  It is unspecified by the spec. what happens.
 		 */
 	} else {
 		/* Header size in 32-bit words. */
-		hwords = 7;
 		lrh0 = IPATH_LRH_BTH;
 		ohdr = &qp->s_hdr.u.oth;
 	}
-	if (wr->opcode == IB_WR_SEND_WITH_IMM) {
-		ohdr->u.ud.imm_data = wr->imm_data;
-		wc.imm_data = wr->imm_data;
-		hwords += 1;
+	if (wqe->wr.opcode == IB_WR_SEND_WITH_IMM) {
+		ohdr->u.ud.imm_data = wqe->wr.imm_data;
 		bth0 = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE << 24;
-	} else if (wr->opcode == IB_WR_SEND) {
-		wc.imm_data = 0;
+	} else
 		bth0 = IB_OPCODE_UD_SEND_ONLY << 24;
-	} else {
-		ret = -EINVAL;
-		goto bail;
-	}
 	lrh0 |= ah_attr->sl << 4;
 	if (qp->ibqp.qp_type == IB_QPT_SMI)
 		lrh0 |= 0xF000;	/* Set VL (see ch. 13.5.3.1) */
 	qp->s_hdr.lrh[0] = cpu_to_be16(lrh0);
 	qp->s_hdr.lrh[1] = cpu_to_be16(ah_attr->dlid);	/* DEST LID */
-	qp->s_hdr.lrh[2] = cpu_to_be16(hwords + nwords + SIZE_OF_CRC);
+	qp->s_hdr.lrh[2] = cpu_to_be16(qp->s_hdrwords + nwords +
+					   SIZE_OF_CRC);
 	lid = dev->dd->ipath_lid;
 	if (lid) {
 		lid |= ah_attr->src_path_bits &
-			((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+			((1 << dev->dd->ipath_lmc) - 1);
 		qp->s_hdr.lrh[3] = cpu_to_be16(lid);
 	} else
 		qp->s_hdr.lrh[3] = IB_LID_PERMISSIVE;
-	if (wr->send_flags & IB_SEND_SOLICITED)
+	if (wqe->wr.send_flags & IB_SEND_SOLICITED)
 		bth0 |= 1 << 23;
 	bth0 |= extra_bytes << 20;
 	bth0 |= qp->ibqp.qp_type == IB_QPT_SMI ? IPATH_DEFAULT_P_KEY :
@@ -460,38 +357,20 @@ int ipath_post_ud_send(struct ipath_qp *qp, struct ib_send_wr *wr)
 	ohdr->bth[1] = ah_attr->dlid >= IPATH_MULTICAST_LID_BASE &&
 		ah_attr->dlid != IPATH_PERMISSIVE_LID ?
 		__constant_cpu_to_be32(IPATH_MULTICAST_QPN) :
-		cpu_to_be32(wr->wr.ud.remote_qpn);
-	/* XXX Could lose a PSN count but not worth locking */
+		cpu_to_be32(wqe->wr.wr.ud.remote_qpn);
 	ohdr->bth[2] = cpu_to_be32(qp->s_next_psn++ & IPATH_PSN_MASK);
 	/*
 	 * Qkeys with the high order bit set mean use the
 	 * qkey from the QP context instead of the WR (see 10.2.5).
 	 */
-	ohdr->u.ud.deth[0] = cpu_to_be32((int)wr->wr.ud.remote_qkey < 0 ?
-					 qp->qkey : wr->wr.ud.remote_qkey);
+	ohdr->u.ud.deth[0] = cpu_to_be32((int)wqe->wr.wr.ud.remote_qkey < 0 ?
+					 qp->qkey : wqe->wr.wr.ud.remote_qkey);
 	ohdr->u.ud.deth[1] = cpu_to_be32(qp->ibqp.qp_num);
-	if (ipath_verbs_send(dev->dd, hwords, (u32 *) &qp->s_hdr,
-			     len, &ss))
-		dev->n_no_piobuf++;
 
 done:
-	/* Queue the completion status entry. */
-	if (!(qp->s_flags & IPATH_S_SIGNAL_REQ_WR) ||
-	    (wr->send_flags & IB_SEND_SIGNALED)) {
-		wc.wr_id = wr->wr_id;
-		wc.status = IB_WC_SUCCESS;
-		wc.vendor_err = 0;
-		wc.opcode = IB_WC_SEND;
-		wc.byte_len = len;
-		wc.qp = &qp->ibqp;
-		wc.src_qp = 0;
-		wc.wc_flags = 0;
-		/* XXX initialize other fields? */
-		ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 0);
-	}
-	kfree(sg_list);
-
-	ret = 0;
+	if (++qp->s_cur >= qp->s_size)
+		qp->s_cur = 0;
+	ret = 1;
 
 bail:
 	return ret;
@@ -672,7 +551,8 @@ void ipath_ud_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 	 * Save the LMC lower bits if the destination LID is a unicast LID.
 	 */
 	wc.dlid_path_bits = dlid >= IPATH_MULTICAST_LID_BASE ? 0 :
-		dlid & ((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+		dlid & ((1 << dev->dd->ipath_lmc) - 1);
+	wc.port_num = 1;
 	/* Signal completion event if the solicited bit is set. */
 	ipath_cq_enter(to_icq(qp->ibqp.recv_cq), &wc,
 		       (ohdr->bth[0] &
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.c b/drivers/infiniband/hw/ipath/ipath_verbs.c
index 16aa61fd8085..74f77e7c2c1b 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.c
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.c
@@ -230,6 +230,121 @@ void ipath_skip_sge(struct ipath_sge_state *ss, u32 length)
 	}
 }
 
+static void ipath_flush_wqe(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+	struct ib_wc wc;
+
+	memset(&wc, 0, sizeof(wc));
+	wc.wr_id = wr->wr_id;
+	wc.status = IB_WC_WR_FLUSH_ERR;
+	wc.opcode = ib_ipath_wc_opcode[wr->opcode];
+	wc.qp = &qp->ibqp;
+	ipath_cq_enter(to_icq(qp->ibqp.send_cq), &wc, 1);
+}
+
+/**
+ * ipath_post_one_send - post one RC, UC, or UD send work request
+ * @qp: the QP to post on
+ * @wr: the work request to send
+ */
+static int ipath_post_one_send(struct ipath_qp *qp, struct ib_send_wr *wr)
+{
+	struct ipath_swqe *wqe;
+	u32 next;
+	int i;
+	int j;
+	int acc;
+	int ret;
+	unsigned long flags;
+
+	spin_lock_irqsave(&qp->s_lock, flags);
+
+	/* Check that state is OK to post send. */
+	if (unlikely(!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK))) {
+		if (qp->state != IB_QPS_SQE && qp->state != IB_QPS_ERR)
+			goto bail_inval;
+		/* C10-96 says generate a flushed completion entry. */
+		ipath_flush_wqe(qp, wr);
+		ret = 0;
+		goto bail;
+	}
+
+	/* IB spec says that num_sge == 0 is OK. */
+	if (wr->num_sge > qp->s_max_sge)
+		goto bail_inval;
+
+	/*
+	 * Don't allow RDMA reads or atomic operations on UC or
+	 * undefined operations.
+	 * Make sure buffer is large enough to hold the result for atomics.
+	 */
+	if (qp->ibqp.qp_type == IB_QPT_UC) {
+		if ((unsigned) wr->opcode >= IB_WR_RDMA_READ)
+			goto bail_inval;
+	} else if (qp->ibqp.qp_type == IB_QPT_UD) {
+		/* Check UD opcode */
+		if (wr->opcode != IB_WR_SEND &&
+		    wr->opcode != IB_WR_SEND_WITH_IMM)
+			goto bail_inval;
+		/* Check UD destination address PD */
+		if (qp->ibqp.pd != wr->wr.ud.ah->pd)
+			goto bail_inval;
+	} else if ((unsigned) wr->opcode > IB_WR_ATOMIC_FETCH_AND_ADD)
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_ATOMIC_CMP_AND_SWP &&
+		   (wr->num_sge == 0 ||
+		    wr->sg_list[0].length < sizeof(u64) ||
+		    wr->sg_list[0].addr & (sizeof(u64) - 1)))
+		goto bail_inval;
+	else if (wr->opcode >= IB_WR_RDMA_READ && !qp->s_max_rd_atomic)
+		goto bail_inval;
+
+	next = qp->s_head + 1;
+	if (next >= qp->s_size)
+		next = 0;
+	if (next == qp->s_last)
+		goto bail_inval;
+
+	wqe = get_swqe_ptr(qp, qp->s_head);
+	wqe->wr = *wr;
+	wqe->ssn = qp->s_ssn++;
+	wqe->length = 0;
+	if (wr->num_sge) {
+		acc = wr->opcode >= IB_WR_RDMA_READ ?
+			IB_ACCESS_LOCAL_WRITE : 0;
+		for (i = 0, j = 0; i < wr->num_sge; i++) {
+			u32 length = wr->sg_list[i].length;
+			int ok;
+
+			if (length == 0)
+				continue;
+			ok = ipath_lkey_ok(qp, &wqe->sg_list[j],
+					   &wr->sg_list[i], acc);
+			if (!ok)
+				goto bail_inval;
+			wqe->length += length;
+			j++;
+		}
+		wqe->wr.num_sge = j;
+	}
+	if (qp->ibqp.qp_type == IB_QPT_UC ||
+	    qp->ibqp.qp_type == IB_QPT_RC) {
+		if (wqe->length > 0x80000000U)
+			goto bail_inval;
+	} else if (wqe->length > to_idev(qp->ibqp.device)->dd->ipath_ibmtu)
+		goto bail_inval;
+	qp->s_head = next;
+
+	ret = 0;
+	goto bail;
+
+bail_inval:
+	ret = -EINVAL;
+bail:
+	spin_unlock_irqrestore(&qp->s_lock, flags);
+	return ret;
+}
+
 /**
  * ipath_post_send - post a send on a QP
  * @ibqp: the QP to post the send on
@@ -244,35 +359,17 @@ static int ipath_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 	struct ipath_qp *qp = to_iqp(ibqp);
 	int err = 0;
 
-	/* Check that state is OK to post send. */
-	if (!(ib_ipath_state_ops[qp->state] & IPATH_POST_SEND_OK)) {
-		*bad_wr = wr;
-		err = -EINVAL;
-		goto bail;
-	}
-
 	for (; wr; wr = wr->next) {
-		switch (qp->ibqp.qp_type) {
-		case IB_QPT_UC:
-		case IB_QPT_RC:
-			err = ipath_post_ruc_send(qp, wr);
-			break;
-
-		case IB_QPT_SMI:
-		case IB_QPT_GSI:
-		case IB_QPT_UD:
-			err = ipath_post_ud_send(qp, wr);
-			break;
-
-		default:
-			err = -EINVAL;
-		}
+		err = ipath_post_one_send(qp, wr);
 		if (err) {
 			*bad_wr = wr;
-			break;
+			goto bail;
 		}
 	}
 
+	/* Try to do the send work in the caller's context. */
+	ipath_do_send((unsigned long) qp);
+
 bail:
 	return err;
 }
@@ -416,7 +513,7 @@ void ipath_ib_rcv(struct ipath_ibdev *dev, void *rhdr, void *data,
 	/* Check for a valid destination LID (see ch. 7.11.1). */
 	lid = be16_to_cpu(hdr->lrh[1]);
 	if (lid < IPATH_MULTICAST_LID_BASE) {
-		lid &= ~((1 << (dev->mkeyprot_resv_lmc & 7)) - 1);
+		lid &= ~((1 << dev->dd->ipath_lmc) - 1);
 		if (unlikely(lid != dev->dd->ipath_lid)) {
 			dev->rcv_errors++;
 			goto bail;
@@ -631,7 +728,7 @@ static inline u32 clear_upper_bytes(u32 data, u32 n, u32 off)
 #endif
 
 static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
-		    u32 length)
+		    u32 length, unsigned flush_wc)
 {
 	u32 extra = 0;
 	u32 data = 0;
@@ -641,11 +738,11 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
 		u32 len = ss->sge.length;
 		u32 off;
 
-		BUG_ON(len == 0);
 		if (len > length)
 			len = length;
 		if (len > ss->sge.sge_length)
 			len = ss->sge.sge_length;
+		BUG_ON(len == 0);
 		/* If the source address is not aligned, try to align it. */
 		off = (unsigned long)ss->sge.vaddr & (sizeof(u32) - 1);
 		if (off) {
@@ -757,36 +854,25 @@ static void copy_io(u32 __iomem *piobuf, struct ipath_sge_state *ss,
 	}
 	/* Update address before sending packet. */
 	update_sge(ss, length);
-	/* must flush early everything before trigger word */
-	ipath_flush_wc();
-	__raw_writel(last, piobuf);
-	/* be sure trigger word is written */
-	ipath_flush_wc();
+	if (flush_wc) {
+		/* must flush early everything before trigger word */
+		ipath_flush_wc();
+		__raw_writel(last, piobuf);
+		/* be sure trigger word is written */
+		ipath_flush_wc();
+	} else
+		__raw_writel(last, piobuf);
 }
 
-/**
- * ipath_verbs_send - send a packet
- * @dd: the infinipath device
- * @hdrwords: the number of words in the header
- * @hdr: the packet header
- * @len: the length of the packet in bytes
- * @ss: the SGE to send
- */
-int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
-		     u32 *hdr, u32 len, struct ipath_sge_state *ss)
+static int ipath_verbs_send_pio(struct ipath_qp *qp, u32 *hdr, u32 hdrwords,
+				struct ipath_sge_state *ss, u32 len,
+				u32 plen, u32 dwords)
 {
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
 	u32 __iomem *piobuf;
-	u32 plen;
+	unsigned flush_wc;
 	int ret;
 
-	/* +1 is for the qword padding of pbc */
-	plen = hdrwords + ((len + 3) >> 2) + 1;
-	if (unlikely((plen << 2) > dd->ipath_ibmaxlen)) {
-		ret = -EINVAL;
-		goto bail;
-	}
-
-	/* Get a PIO buffer to use. */
 	piobuf = ipath_getpiobuf(dd, NULL);
 	if (unlikely(piobuf == NULL)) {
 		ret = -EBUSY;
@@ -799,51 +885,90 @@ int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
 	 * or WC buffer can be written out of order.
 	 */
 	writeq(plen, piobuf);
-	ipath_flush_wc();
 	piobuf += 2;
+
+	flush_wc = dd->ipath_flags & IPATH_PIO_FLUSH_WC;
 	if (len == 0) {
 		/*
 		 * If there is just the header portion, must flush before
 		 * writing last word of header for correctness, and after
 		 * the last header word (trigger word).
 		 */
-		__iowrite32_copy(piobuf, hdr, hdrwords - 1);
-		ipath_flush_wc();
-		__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
-		ipath_flush_wc();
-		ret = 0;
-		goto bail;
+		if (flush_wc) {
+			ipath_flush_wc();
+			__iowrite32_copy(piobuf, hdr, hdrwords - 1);
+			ipath_flush_wc();
+			__raw_writel(hdr[hdrwords - 1], piobuf + hdrwords - 1);
+			ipath_flush_wc();
+		} else
+			__iowrite32_copy(piobuf, hdr, hdrwords);
+		goto done;
 	}
 
+	if (flush_wc)
+		ipath_flush_wc();
 	__iowrite32_copy(piobuf, hdr, hdrwords);
 	piobuf += hdrwords;
 
 	/* The common case is aligned and contained in one segment. */
 	if (likely(ss->num_sge == 1 && len <= ss->sge.length &&
 		   !((unsigned long)ss->sge.vaddr & (sizeof(u32) - 1)))) {
-		u32 w;
 		u32 *addr = (u32 *) ss->sge.vaddr;
 
 		/* Update address before sending packet. */
 		update_sge(ss, len);
-		/* Need to round up for the last dword in the packet. */
-		w = (len + 3) >> 2;
-		__iowrite32_copy(piobuf, addr, w - 1);
-		/* must flush early everything before trigger word */
-		ipath_flush_wc();
-		__raw_writel(addr[w - 1], piobuf + w - 1);
-		/* be sure trigger word is written */
-		ipath_flush_wc();
-		ret = 0;
-		goto bail;
+		if (flush_wc) {
+			__iowrite32_copy(piobuf, addr, dwords - 1);
+			/* must flush early everything before trigger word */
+			ipath_flush_wc();
+			__raw_writel(addr[dwords - 1], piobuf + dwords - 1);
+			/* be sure trigger word is written */
+			ipath_flush_wc();
+		} else
+			__iowrite32_copy(piobuf, addr, dwords);
+		goto done;
 	}
-	copy_io(piobuf, ss, len);
+	copy_io(piobuf, ss, len, flush_wc);
+done:
+	if (qp->s_wqe)
+		ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
 	ret = 0;
-
 bail:
 	return ret;
 }
 
+/**
+ * ipath_verbs_send - send a packet
+ * @qp: the QP to send on
+ * @hdr: the packet header
+ * @hdrwords: the number of words in the header
+ * @ss: the SGE to send
+ * @len: the length of the packet in bytes
+ */
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+		     u32 hdrwords, struct ipath_sge_state *ss, u32 len)
+{
+	struct ipath_devdata *dd = to_idev(qp->ibqp.device)->dd;
+	u32 plen;
+	int ret;
+	u32 dwords = (len + 3) >> 2;
+
+	/* +1 is for the qword padding of pbc */
+	plen = hdrwords + dwords + 1;
+
+	/* Drop non-VL15 packets if we are not in the active state */
+	if (!(dd->ipath_flags & IPATH_LINKACTIVE) &&
+	    qp->ibqp.qp_type != IB_QPT_SMI) {
+		if (qp->s_wqe)
+			ipath_send_complete(qp, qp->s_wqe, IB_WC_SUCCESS);
+		ret = 0;
+	} else
+		ret = ipath_verbs_send_pio(qp, (u32 *) hdr, hdrwords,
+					   ss, len, plen, dwords);
+
+	return ret;
+}
+
 int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
 			    u64 *rwords, u64 *spkts, u64 *rpkts,
 			    u64 *xmit_wait)
@@ -852,7 +977,6 @@ int ipath_snapshot_counters(struct ipath_devdata *dd, u64 *swords,
 
 	if (!(dd->ipath_flags & IPATH_INITTED)) {
 		/* no hardware, freeze, etc. */
-		ipath_dbg("unit %u not usable\n", dd->ipath_unit);
 		ret = -EINVAL;
 		goto bail;
 	}
@@ -878,48 +1002,44 @@ bail:
 int ipath_get_counters(struct ipath_devdata *dd,
 		       struct ipath_verbs_counters *cntrs)
 {
+	struct ipath_cregs const *crp = dd->ipath_cregs;
 	int ret;
 
 	if (!(dd->ipath_flags & IPATH_INITTED)) {
 		/* no hardware, freeze, etc. */
-		ipath_dbg("unit %u not usable\n", dd->ipath_unit);
 		ret = -EINVAL;
 		goto bail;
 	}
 	cntrs->symbol_error_counter =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_ibsymbolerrcnt);
+		ipath_snap_cntr(dd, crp->cr_ibsymbolerrcnt);
 	cntrs->link_error_recovery_counter =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkerrrecovcnt);
+		ipath_snap_cntr(dd, crp->cr_iblinkerrrecovcnt);
 	/*
 	 * The link downed counter counts when the other side downs the
 	 * connection.  We add in the number of times we downed the link
 	 * due to local link integrity errors to compensate.
 	 */
 	cntrs->link_downed_counter =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_iblinkdowncnt);
+		ipath_snap_cntr(dd, crp->cr_iblinkdowncnt);
 	cntrs->port_rcv_errors =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_rxdroppktcnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvovflcnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_portovflcnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_err_rlencnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_invalidrlencnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_erricrccnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_errvcrccnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_errlpcrccnt) +
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_badformatcnt) +
+		ipath_snap_cntr(dd, crp->cr_rxdroppktcnt) +
+		ipath_snap_cntr(dd, crp->cr_rcvovflcnt) +
+		ipath_snap_cntr(dd, crp->cr_portovflcnt) +
+		ipath_snap_cntr(dd, crp->cr_err_rlencnt) +
+		ipath_snap_cntr(dd, crp->cr_invalidrlencnt) +
+		ipath_snap_cntr(dd, crp->cr_errlinkcnt) +
+		ipath_snap_cntr(dd, crp->cr_erricrccnt) +
+		ipath_snap_cntr(dd, crp->cr_errvcrccnt) +
+		ipath_snap_cntr(dd, crp->cr_errlpcrccnt) +
+		ipath_snap_cntr(dd, crp->cr_badformatcnt) +
 		dd->ipath_rxfc_unsupvl_errs;
 	cntrs->port_rcv_remphys_errors =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_rcvebpcnt);
-	cntrs->port_xmit_discards =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_unsupvlcnt);
-	cntrs->port_xmit_data =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordsendcnt);
-	cntrs->port_rcv_data =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_wordrcvcnt);
-	cntrs->port_xmit_packets =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktsendcnt);
-	cntrs->port_rcv_packets =
-		ipath_snap_cntr(dd, dd->ipath_cregs->cr_pktrcvcnt);
+		ipath_snap_cntr(dd, crp->cr_rcvebpcnt);
+	cntrs->port_xmit_discards = ipath_snap_cntr(dd, crp->cr_unsupvlcnt);
+	cntrs->port_xmit_data = ipath_snap_cntr(dd, crp->cr_wordsendcnt);
+	cntrs->port_rcv_data = ipath_snap_cntr(dd, crp->cr_wordrcvcnt);
+	cntrs->port_xmit_packets = ipath_snap_cntr(dd, crp->cr_pktsendcnt);
+	cntrs->port_rcv_packets = ipath_snap_cntr(dd, crp->cr_pktrcvcnt);
 	cntrs->local_link_integrity_errors =
 		(dd->ipath_flags & IPATH_GPIO_ERRINTRS) ?
 		dd->ipath_lli_errs : dd->ipath_lli_errors;
@@ -1033,25 +1153,26 @@ static int ipath_query_port(struct ib_device *ibdev,
 			    u8 port, struct ib_port_attr *props)
 {
 	struct ipath_ibdev *dev = to_idev(ibdev);
+	struct ipath_devdata *dd = dev->dd;
 	enum ib_mtu mtu;
-	u16 lid = dev->dd->ipath_lid;
+	u16 lid = dd->ipath_lid;
 	u64 ibcstat;
 
 	memset(props, 0, sizeof(*props));
 	props->lid = lid ? lid : __constant_be16_to_cpu(IB_LID_PERMISSIVE);
-	props->lmc = dev->mkeyprot_resv_lmc & 7;
+	props->lmc = dd->ipath_lmc;
 	props->sm_lid = dev->sm_lid;
 	props->sm_sl = dev->sm_sl;
-	ibcstat = dev->dd->ipath_lastibcstat;
+	ibcstat = dd->ipath_lastibcstat;
 	props->state = ((ibcstat >> 4) & 0x3) + 1;
 	/* See phys_state_show() */
 	props->phys_state = ipath_cvt_physportstate[
-		dev->dd->ipath_lastibcstat & 0xf];
+		dd->ipath_lastibcstat & 0xf];
 	props->port_cap_flags = dev->port_cap_flags;
 	props->gid_tbl_len = 1;
 	props->max_msg_sz = 0x80000000;
-	props->pkey_tbl_len = ipath_get_npkeys(dev->dd);
-	props->bad_pkey_cntr = ipath_get_cr_errpkey(dev->dd) -
+	props->pkey_tbl_len = ipath_get_npkeys(dd);
+	props->bad_pkey_cntr = ipath_get_cr_errpkey(dd) -
 		dev->z_pkey_violations;
 	props->qkey_viol_cntr = dev->qkey_violations;
 	props->active_width = IB_WIDTH_4X;
@@ -1061,12 +1182,12 @@ static int ipath_query_port(struct ib_device *ibdev,
 	props->init_type_reply = 0;
 
 	/*
-	 * Note: the chips support a maximum MTU of 4096, but the driver
+	 * Note: the chip supports a maximum MTU of 4096, but the driver
 	 * hasn't implemented this feature yet, so set the maximum value
 	 * to 2048.
 	 */
 	props->max_mtu = IB_MTU_2048;
-	switch (dev->dd->ipath_ibmtu) {
+	switch (dd->ipath_ibmtu) {
 	case 4096:
 		mtu = IB_MTU_4096;
 		break;
@@ -1415,9 +1536,7 @@ static int disable_timer(struct ipath_devdata *dd)
 {
 	/* Disable GPIO bit 2 interrupt */
 	if (dd->ipath_flags & IPATH_GPIO_INTR) {
-                u64 val;
                 /* Disable GPIO bit 2 interrupt */
-                val = ipath_read_kreg64(dd, dd->ipath_kregs->kr_gpio_mask);
 		dd->ipath_gpio_mask &= ~((u64) (1 << IPATH_GPIO_PORT0_BIT));
 		ipath_write_kreg(dd, dd->ipath_kregs->kr_gpio_mask,
 				 dd->ipath_gpio_mask);
diff --git a/drivers/infiniband/hw/ipath/ipath_verbs.h b/drivers/infiniband/hw/ipath/ipath_verbs.h
index 1a24c6a4a814..6ccb54f104a3 100644
--- a/drivers/infiniband/hw/ipath/ipath_verbs.h
+++ b/drivers/infiniband/hw/ipath/ipath_verbs.h
@@ -42,6 +42,8 @@
 #include <rdma/ib_pack.h>
 #include <rdma/ib_user_verbs.h>
 
+#include "ipath_kernel.h"
+
 #define IPATH_MAX_RDMA_ATOMIC	4
 
 #define QPN_MAX                 (1 << 24)
@@ -59,6 +61,7 @@
  */
 #define IB_CQ_NONE	(IB_CQ_NEXT_COMP + 1)
 
+/* AETH NAK opcode values */
 #define IB_RNR_NAK			0x20
 #define IB_NAK_PSN_ERROR		0x60
 #define IB_NAK_INVALID_REQUEST		0x61
@@ -66,6 +69,7 @@
 #define IB_NAK_REMOTE_OPERATIONAL_ERROR 0x63
 #define IB_NAK_INVALID_RD_REQUEST	0x64
 
+/* Flags for checking QP state (see ib_ipath_state_ops[]) */
 #define IPATH_POST_SEND_OK		0x01
 #define IPATH_POST_RECV_OK		0x02
 #define IPATH_PROCESS_RECV_OK		0x04
@@ -187,7 +191,11 @@ struct ipath_mmap_info {
 struct ipath_cq_wc {
 	u32 head;		/* index of next entry to fill */
 	u32 tail;		/* index of next ib_poll_cq() entry */
-	struct ib_uverbs_wc queue[1]; /* this is actually size ibcq.cqe + 1 */
+	union {
+		/* these are actually size ibcq.cqe + 1 */
+		struct ib_uverbs_wc uqueue[0];
+		struct ib_wc kqueue[0];
+	};
 };
 
 /*
@@ -239,7 +247,7 @@ struct ipath_mregion {
  */
 struct ipath_sge {
 	struct ipath_mregion *mr;
-	void *vaddr;		/* current pointer into the segment */
+	void *vaddr;		/* kernel virtual address of segment */
 	u32 sge_length;		/* length of the SGE */
 	u32 length;		/* remaining length of the segment */
 	u16 m;			/* current index: mr->map[m] */
@@ -407,6 +415,7 @@ struct ipath_qp {
 	u32 s_ssn;		/* SSN of tail entry */
 	u32 s_lsn;		/* limit sequence number (credit) */
 	struct ipath_swqe *s_wq;	/* send work queue */
+	struct ipath_swqe *s_wqe;
 	struct ipath_rq r_rq;		/* receive work queue */
 	struct ipath_sge r_sg_list[0];	/* verified SGEs */
 };
@@ -492,7 +501,7 @@ struct ipath_ibdev {
 	int ib_unit;		/* This is the device number */
 	u16 sm_lid;		/* in host order */
 	u8 sm_sl;
-	u8 mkeyprot_resv_lmc;
+	u8 mkeyprot;
 	/* non-zero when timer is set */
 	unsigned long mkey_lease_timeout;
 
@@ -667,7 +676,7 @@ struct ib_qp *ipath_create_qp(struct ib_pd *ibpd,
 
 int ipath_destroy_qp(struct ib_qp *ibqp);
 
-void ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
+int ipath_error_qp(struct ipath_qp *qp, enum ib_wc_status err);
 
 int ipath_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr,
 		    int attr_mask, struct ib_udata *udata);
@@ -683,8 +692,8 @@ void ipath_sqerror_qp(struct ipath_qp *qp, struct ib_wc *wc);
 
 void ipath_get_credit(struct ipath_qp *qp, u32 aeth);
 
-int ipath_verbs_send(struct ipath_devdata *dd, u32 hdrwords,
-		     u32 *hdr, u32 len, struct ipath_sge_state *ss);
+int ipath_verbs_send(struct ipath_qp *qp, struct ipath_ib_header *hdr,
+		     u32 hdrwords, struct ipath_sge_state *ss, u32 len);
 
 void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
 
@@ -692,8 +701,6 @@ void ipath_copy_sge(struct ipath_sge_state *ss, void *data, u32 length);
 
 void ipath_skip_sge(struct ipath_sge_state *ss, u32 length);
 
-int ipath_post_ruc_send(struct ipath_qp *qp, struct ib_send_wr *wr);
-
 void ipath_uc_rcv(struct ipath_ibdev *dev, struct ipath_ib_header *hdr,
 		  int has_grh, void *data, u32 tlen, struct ipath_qp *qp);
 
@@ -733,6 +740,8 @@ int ipath_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *attr);
 
 int ipath_destroy_srq(struct ib_srq *ibsrq);
 
+void ipath_cq_enter(struct ipath_cq *cq, struct ib_wc *entry, int sig);
+
 int ipath_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *entry);
 
 struct ib_cq *ipath_create_cq(struct ib_device *ibdev, int entries, int comp_vector,
@@ -782,18 +791,28 @@ int ipath_mmap(struct ib_ucontext *context, struct vm_area_struct *vma);
 
 void ipath_insert_rnr_queue(struct ipath_qp *qp);
 
+int ipath_init_sge(struct ipath_qp *qp, struct ipath_rwqe *wqe,
+		   u32 *lengthp, struct ipath_sge_state *ss);
+
 int ipath_get_rwqe(struct ipath_qp *qp, int wr_id_only);
 
 u32 ipath_make_grh(struct ipath_ibdev *dev, struct ib_grh *hdr,
 		   struct ib_global_route *grh, u32 hwords, u32 nwords);
 
-void ipath_do_ruc_send(unsigned long data);
+void ipath_make_ruc_header(struct ipath_ibdev *dev, struct ipath_qp *qp,
+			   struct ipath_other_headers *ohdr,
+			   u32 bth0, u32 bth2);
+
+void ipath_do_send(unsigned long data);
+
+void ipath_send_complete(struct ipath_qp *qp, struct ipath_swqe *wqe,
+			 enum ib_wc_status status);
+
+int ipath_make_rc_req(struct ipath_qp *qp);
 
-int ipath_make_rc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr,
-		      u32 pmtu, u32 *bth0p, u32 *bth2p);
+int ipath_make_uc_req(struct ipath_qp *qp);
 
-int ipath_make_uc_req(struct ipath_qp *qp, struct ipath_other_headers *ohdr,
-		      u32 pmtu, u32 *bth0p, u32 *bth2p);
+int ipath_make_ud_req(struct ipath_qp *qp);
 
 int ipath_register_ib_device(struct ipath_devdata *);
 
diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c
index dde8fe9af47e..d8287d9db41e 100644
--- a/drivers/infiniband/hw/mlx4/main.c
+++ b/drivers/infiniband/hw/mlx4/main.c
@@ -476,9 +476,48 @@ out:
 	return err;
 }
 
+static ssize_t show_hca(struct class_device *cdev, char *buf)
+{
+	struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev);
+	return sprintf(buf, "MT%d\n", dev->dev->pdev->device);
+}
+
+static ssize_t show_fw_ver(struct class_device *cdev, char *buf)
+{
+	struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev);
+	return sprintf(buf, "%d.%d.%d\n", (int) (dev->dev->caps.fw_ver >> 32),
+		       (int) (dev->dev->caps.fw_ver >> 16) & 0xffff,
+		       (int) dev->dev->caps.fw_ver & 0xffff);
+}
+
+static ssize_t show_rev(struct class_device *cdev, char *buf)
+{
+	struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev);
+	return sprintf(buf, "%x\n", dev->dev->rev_id);
+}
+
+static ssize_t show_board(struct class_device *cdev, char *buf)
+{
+	struct mlx4_ib_dev *dev = container_of(cdev, struct mlx4_ib_dev, ib_dev.class_dev);
+	return sprintf(buf, "%.*s\n", MLX4_BOARD_ID_LEN, dev->dev->board_id);
+}
+
+static CLASS_DEVICE_ATTR(hw_rev,   S_IRUGO, show_rev,    NULL);
+static CLASS_DEVICE_ATTR(fw_ver,   S_IRUGO, show_fw_ver, NULL);
+static CLASS_DEVICE_ATTR(hca_type, S_IRUGO, show_hca,    NULL);
+static CLASS_DEVICE_ATTR(board_id, S_IRUGO, show_board,  NULL);
+
+static struct class_device_attribute *mlx4_class_attributes[] = {
+	&class_device_attr_hw_rev,
+	&class_device_attr_fw_ver,
+	&class_device_attr_hca_type,
+	&class_device_attr_board_id
+};
+
 static void *mlx4_ib_add(struct mlx4_dev *dev)
 {
 	struct mlx4_ib_dev *ibdev;
+	int i;
 
 	ibdev = (struct mlx4_ib_dev *) ib_alloc_device(sizeof *ibdev);
 	if (!ibdev) {
@@ -568,6 +607,11 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	ibdev->ib_dev.detach_mcast	= mlx4_ib_mcg_detach;
 	ibdev->ib_dev.process_mad	= mlx4_ib_process_mad;
 
+	ibdev->ib_dev.alloc_fmr		= mlx4_ib_fmr_alloc;
+	ibdev->ib_dev.map_phys_fmr	= mlx4_ib_map_phys_fmr;
+	ibdev->ib_dev.unmap_fmr		= mlx4_ib_unmap_fmr;
+	ibdev->ib_dev.dealloc_fmr	= mlx4_ib_fmr_dealloc;
+
 	if (init_node_data(ibdev))
 		goto err_map;
 
@@ -580,6 +624,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev)
 	if (mlx4_ib_mad_init(ibdev))
 		goto err_reg;
 
+	for (i = 0; i < ARRAY_SIZE(mlx4_class_attributes); ++i) {
+		if (class_device_create_file(&ibdev->ib_dev.class_dev,
+					       mlx4_class_attributes[i]))
+			goto err_reg;
+	}
+
 	return ibdev;
 
 err_reg:
diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h
index 705ff2fa237e..28697653a370 100644
--- a/drivers/infiniband/hw/mlx4/mlx4_ib.h
+++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h
@@ -93,6 +93,11 @@ struct mlx4_ib_mr {
 	struct ib_umem	       *umem;
 };
 
+struct mlx4_ib_fmr {
+	struct ib_fmr           ibfmr;
+	struct mlx4_fmr         mfmr;
+};
+
 struct mlx4_ib_wq {
 	u64		       *wrid;
 	spinlock_t		lock;
@@ -199,6 +204,10 @@ static inline struct mlx4_ib_mr *to_mmr(struct ib_mr *ibmr)
 	return container_of(ibmr, struct mlx4_ib_mr, ibmr);
 }
 
+static inline struct mlx4_ib_fmr *to_mfmr(struct ib_fmr *ibfmr)
+{
+	return container_of(ibfmr, struct mlx4_ib_fmr, ibfmr);
+}
 static inline struct mlx4_ib_qp *to_mqp(struct ib_qp *ibqp)
 {
 	return container_of(ibqp, struct mlx4_ib_qp, ibqp);
@@ -284,6 +293,13 @@ int mlx4_ib_process_mad(struct ib_device *ibdev, int mad_flags,	u8 port_num,
 int mlx4_ib_mad_init(struct mlx4_ib_dev *dev);
 void mlx4_ib_mad_cleanup(struct mlx4_ib_dev *dev);
 
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int mr_access_flags,
+				  struct ib_fmr_attr *fmr_attr);
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list, int npages,
+			 u64 iova);
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list);
+int mlx4_ib_fmr_dealloc(struct ib_fmr *fmr);
+
 static inline int mlx4_ib_ah_grh_present(struct mlx4_ib_ah *ah)
 {
 	return !!(ah->av.g_slid & 0x80);
diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c
index 85ae906f1d12..7dc91a3e712d 100644
--- a/drivers/infiniband/hw/mlx4/mr.c
+++ b/drivers/infiniband/hw/mlx4/mr.c
@@ -96,11 +96,10 @@ int mlx4_ib_umem_write_mtt(struct mlx4_ib_dev *dev, struct mlx4_mtt *mtt,
 				pages[i++] = sg_dma_address(&chunk->page_list[j]) +
 					umem->page_size * k;
 				/*
-				 * Be friendly to WRITE_MTT firmware
-				 * command, and pass it chunks of
-				 * appropriate size.
+				 * Be friendly to mlx4_write_mtt() and
+				 * pass it chunks of appropriate size.
 				 */
-				if (i == PAGE_SIZE / sizeof (u64) - 2) {
+				if (i == PAGE_SIZE / sizeof (u64)) {
 					err = mlx4_write_mtt(dev->dev, mtt, n,
 							     i, pages);
 					if (err)
@@ -182,3 +181,96 @@ int mlx4_ib_dereg_mr(struct ib_mr *ibmr)
 
 	return 0;
 }
+
+struct ib_fmr *mlx4_ib_fmr_alloc(struct ib_pd *pd, int acc,
+				 struct ib_fmr_attr *fmr_attr)
+{
+	struct mlx4_ib_dev *dev = to_mdev(pd->device);
+	struct mlx4_ib_fmr *fmr;
+	int err = -ENOMEM;
+
+	fmr = kmalloc(sizeof *fmr, GFP_KERNEL);
+	if (!fmr)
+		return ERR_PTR(-ENOMEM);
+
+	err = mlx4_fmr_alloc(dev->dev, to_mpd(pd)->pdn, convert_access(acc),
+			     fmr_attr->max_pages, fmr_attr->max_maps,
+			     fmr_attr->page_shift, &fmr->mfmr);
+	if (err)
+		goto err_free;
+
+	err = mlx4_mr_enable(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
+	if (err)
+		goto err_mr;
+
+	fmr->ibfmr.rkey = fmr->ibfmr.lkey = fmr->mfmr.mr.key;
+
+	return &fmr->ibfmr;
+
+err_mr:
+	mlx4_mr_free(to_mdev(pd->device)->dev, &fmr->mfmr.mr);
+
+err_free:
+	kfree(fmr);
+
+	return ERR_PTR(err);
+}
+
+int mlx4_ib_map_phys_fmr(struct ib_fmr *ibfmr, u64 *page_list,
+		      int npages, u64 iova)
+{
+	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+	struct mlx4_ib_dev *dev = to_mdev(ifmr->ibfmr.device);
+
+	return mlx4_map_phys_fmr(dev->dev, &ifmr->mfmr, page_list, npages, iova,
+				 &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+}
+
+int mlx4_ib_unmap_fmr(struct list_head *fmr_list)
+{
+	struct ib_fmr *ibfmr;
+	int err;
+	struct mlx4_dev *mdev = NULL;
+
+	list_for_each_entry(ibfmr, fmr_list, list) {
+		if (mdev && to_mdev(ibfmr->device)->dev != mdev)
+			return -EINVAL;
+		mdev = to_mdev(ibfmr->device)->dev;
+	}
+
+	if (!mdev)
+		return 0;
+
+	list_for_each_entry(ibfmr, fmr_list, list) {
+		struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+
+		mlx4_fmr_unmap(mdev, &ifmr->mfmr, &ifmr->ibfmr.lkey, &ifmr->ibfmr.rkey);
+	}
+
+	/*
+	 * Make sure all MPT status updates are visible before issuing
+	 * SYNC_TPT firmware command.
+	 */
+	wmb();
+
+	err = mlx4_SYNC_TPT(mdev);
+	if (err)
+		printk(KERN_WARNING "mlx4_ib: SYNC_TPT error %d when "
+		       "unmapping FMRs\n", err);
+
+	return 0;
+}
+
+int mlx4_ib_fmr_dealloc(struct ib_fmr *ibfmr)
+{
+	struct mlx4_ib_fmr *ifmr = to_mfmr(ibfmr);
+	struct mlx4_ib_dev *dev = to_mdev(ibfmr->device);
+	int err;
+
+	err = mlx4_fmr_free(dev->dev, &ifmr->mfmr);
+
+	if (!err)
+		kfree(ifmr);
+
+	return err;
+}
diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c
index 85c51bdc36f1..31a480e5b0d0 100644
--- a/drivers/infiniband/hw/mlx4/qp.c
+++ b/drivers/infiniband/hw/mlx4/qp.c
@@ -1249,6 +1249,13 @@ static void set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
 	dseg->byte_count = cpu_to_be32(sg->length);
 }
 
+static void __set_data_seg(struct mlx4_wqe_data_seg *dseg, struct ib_sge *sg)
+{
+	dseg->byte_count = cpu_to_be32(sg->length);
+	dseg->lkey       = cpu_to_be32(sg->lkey);
+	dseg->addr       = cpu_to_be64(sg->addr);
+}
+
 int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr,
 		      struct ib_send_wr **bad_wr)
 {
@@ -1464,11 +1471,8 @@ int mlx4_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr,
 
 		scat = get_recv_wqe(qp, ind);
 
-		for (i = 0; i < wr->num_sge; ++i) {
-			scat[i].byte_count = cpu_to_be32(wr->sg_list[i].length);
-			scat[i].lkey       = cpu_to_be32(wr->sg_list[i].lkey);
-			scat[i].addr       = cpu_to_be64(wr->sg_list[i].addr);
-		}
+		for (i = 0; i < wr->num_sge; ++i)
+			__set_data_seg(scat + i, wr->sg_list + i);
 
 		if (i < qp->rq.max_gs) {
 			scat[i].byte_count = 0;
diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c
index 408748fb5285..e7e9a3d0dac3 100644
--- a/drivers/infiniband/hw/mlx4/srq.c
+++ b/drivers/infiniband/hw/mlx4/srq.c
@@ -251,7 +251,7 @@ int mlx4_ib_query_srq(struct ib_srq *ibsrq, struct ib_srq_attr *srq_attr)
 	if (ret)
 		return ret;
 
-	srq_attr->srq_limit = be16_to_cpu(limit_watermark);
+	srq_attr->srq_limit = limit_watermark;
 	srq_attr->max_wr    = srq->msrq.max - 1;
 	srq_attr->max_sge   = srq->msrq.max_gs;
 
diff --git a/drivers/infiniband/hw/mthca/mthca_cmd.c b/drivers/infiniband/hw/mthca/mthca_cmd.c
index acc95892713a..6966f943f440 100644
--- a/drivers/infiniband/hw/mthca/mthca_cmd.c
+++ b/drivers/infiniband/hw/mthca/mthca_cmd.c
@@ -290,6 +290,12 @@ static int mthca_cmd_post(struct mthca_dev *dev,
 		err = mthca_cmd_post_hcr(dev, in_param, out_param, in_modifier,
 					 op_modifier, op, token, event);
 
+	/*
+	 * Make sure that our HCR writes don't get mixed in with
+	 * writes from another CPU starting a FW command.
+	 */
+	mmiowb();
+
 	mutex_unlock(&dev->cmd.hcr_mutex);
 	return err;
 }
diff --git a/drivers/infiniband/hw/mthca/mthca_dev.h b/drivers/infiniband/hw/mthca/mthca_dev.h
index 9bae3cc60603..15aa32eb78b6 100644
--- a/drivers/infiniband/hw/mthca/mthca_dev.h
+++ b/drivers/infiniband/hw/mthca/mthca_dev.h
@@ -83,7 +83,7 @@ enum {
 	MTHCA_QP_CONTEXT_SIZE = 0x200,
 	MTHCA_RDB_ENTRY_SIZE  =  0x20,
 	MTHCA_AV_SIZE         =  0x20,
-	MTHCA_MGM_ENTRY_SIZE  =  0x40,
+	MTHCA_MGM_ENTRY_SIZE  = 0x100,
 
 	/* Arbel FW gives us these, but we need them for Tavor */
 	MTHCA_MPT_ENTRY_SIZE  =  0x40,
diff --git a/drivers/infiniband/hw/mthca/mthca_main.c b/drivers/infiniband/hw/mthca/mthca_main.c
index 76fed7545c53..60de6f93869e 100644
--- a/drivers/infiniband/hw/mthca/mthca_main.c
+++ b/drivers/infiniband/hw/mthca/mthca_main.c
@@ -61,7 +61,7 @@ MODULE_PARM_DESC(debug_level, "Enable debug tracing if > 0");
 
 #ifdef CONFIG_PCI_MSI
 
-static int msi_x = 0;
+static int msi_x = 1;
 module_param(msi_x, int, 0444);
 MODULE_PARM_DESC(msi_x, "attempt to use MSI-X if nonzero");
 
@@ -137,40 +137,23 @@ static const char mthca_version[] __devinitdata =
 
 static int mthca_tune_pci(struct mthca_dev *mdev)
 {
-	int cap;
-	u16 val;
-
 	if (!tune_pci)
 		return 0;
 
 	/* First try to max out Read Byte Count */
-	cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX);
-	if (cap) {
-		if (pci_read_config_word(mdev->pdev, cap + PCI_X_CMD, &val)) {
-			mthca_err(mdev, "Couldn't read PCI-X command register, "
-				  "aborting.\n");
-			return -ENODEV;
-		}
-		val = (val & ~PCI_X_CMD_MAX_READ) | (3 << 2);
-		if (pci_write_config_word(mdev->pdev, cap + PCI_X_CMD, val)) {
-			mthca_err(mdev, "Couldn't write PCI-X command register, "
-				  "aborting.\n");
+	if (pci_find_capability(mdev->pdev, PCI_CAP_ID_PCIX)) {
+		if (pcix_set_mmrbc(mdev->pdev, pcix_get_max_mmrbc(mdev->pdev))) {
+			mthca_err(mdev, "Couldn't set PCI-X max read count, "
+				"aborting.\n");
 			return -ENODEV;
 		}
 	} else if (!(mdev->mthca_flags & MTHCA_FLAG_PCIE))
 		mthca_info(mdev, "No PCI-X capability, not setting RBC.\n");
 
-	cap = pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP);
-	if (cap) {
-		if (pci_read_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, &val)) {
-			mthca_err(mdev, "Couldn't read PCI Express device control "
-				  "register, aborting.\n");
-			return -ENODEV;
-		}
-		val = (val & ~PCI_EXP_DEVCTL_READRQ) | (5 << 12);
-		if (pci_write_config_word(mdev->pdev, cap + PCI_EXP_DEVCTL, val)) {
-			mthca_err(mdev, "Couldn't write PCI Express device control "
-				  "register, aborting.\n");
+	if (pci_find_capability(mdev->pdev, PCI_CAP_ID_EXP)) {
+		if (pcie_set_readrq(mdev->pdev, 4096)) {
+			mthca_err(mdev, "Couldn't write PCI Express read request, "
+				"aborting.\n");
 			return -ENODEV;
 		}
 	} else if (mdev->mthca_flags & MTHCA_FLAG_PCIE)
@@ -833,14 +816,19 @@ static int mthca_setup_hca(struct mthca_dev *dev)
 
 	err = mthca_NOP(dev, &status);
 	if (err || status) {
-		mthca_err(dev, "NOP command failed to generate interrupt (IRQ %d), aborting.\n",
-			  dev->mthca_flags & MTHCA_FLAG_MSI_X ?
-			  dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector :
-			  dev->pdev->irq);
-		if (dev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X))
-			mthca_err(dev, "Try again with MSI/MSI-X disabled.\n");
-		else
+		if (dev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X)) {
+			mthca_warn(dev, "NOP command failed to generate interrupt "
+				   "(IRQ %d).\n",
+				   dev->mthca_flags & MTHCA_FLAG_MSI_X ?
+				   dev->eq_table.eq[MTHCA_EQ_CMD].msi_x_vector :
+				   dev->pdev->irq);
+			mthca_warn(dev, "Trying again with MSI/MSI-X disabled.\n");
+		} else {
+			mthca_err(dev, "NOP command failed to generate interrupt "
+				  "(IRQ %d), aborting.\n",
+				  dev->pdev->irq);
 			mthca_err(dev, "BIOS or ACPI interrupt routing problem?\n");
+		}
 
 		goto err_cmd_poll;
 	}
@@ -1115,24 +1103,6 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
 		goto err_free_dev;
 	}
 
-	if (msi_x && !mthca_enable_msi_x(mdev))
-		mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
-	else if (msi) {
-		static int warned;
-
-		if (!warned) {
-			printk(KERN_WARNING PFX "WARNING: MSI support will be "
-			       "removed from the ib_mthca driver in January 2008.\n");
-			printk(KERN_WARNING "    If you are using MSI and cannot "
-			       "switch to MSI-X, please tell "
-			       "<general@lists.openfabrics.org>.\n");
-			++warned;
-		}
-
-		if (!pci_enable_msi(pdev))
-			mdev->mthca_flags |= MTHCA_FLAG_MSI;
-	}
-
 	if (mthca_cmd_init(mdev)) {
 		mthca_err(mdev, "Failed to init command interface, aborting.\n");
 		goto err_free_dev;
@@ -1156,7 +1126,35 @@ static int __mthca_init_one(struct pci_dev *pdev, int hca_type)
 		mthca_warn(mdev, "If you have problems, try updating your HCA FW.\n");
 	}
 
+	if (msi_x && !mthca_enable_msi_x(mdev))
+		mdev->mthca_flags |= MTHCA_FLAG_MSI_X;
+	else if (msi) {
+		static int warned;
+
+		if (!warned) {
+			printk(KERN_WARNING PFX "WARNING: MSI support will be "
+			       "removed from the ib_mthca driver in January 2008.\n");
+			printk(KERN_WARNING "    If you are using MSI and cannot "
+			       "switch to MSI-X, please tell "
+			       "<general@lists.openfabrics.org>.\n");
+			++warned;
+		}
+
+		if (!pci_enable_msi(pdev))
+			mdev->mthca_flags |= MTHCA_FLAG_MSI;
+	}
+
 	err = mthca_setup_hca(mdev);
+	if (err == -EBUSY && (mdev->mthca_flags & (MTHCA_FLAG_MSI | MTHCA_FLAG_MSI_X))) {
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+			pci_disable_msix(pdev);
+		if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+			pci_disable_msi(pdev);
+		mdev->mthca_flags &= ~(MTHCA_FLAG_MSI_X | MTHCA_FLAG_MSI);
+
+		err = mthca_setup_hca(mdev);
+	}
+
 	if (err)
 		goto err_close;
 
@@ -1192,17 +1190,17 @@ err_cleanup:
 	mthca_cleanup_uar_table(mdev);
 
 err_close:
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
+		pci_disable_msix(pdev);
+	if (mdev->mthca_flags & MTHCA_FLAG_MSI)
+		pci_disable_msi(pdev);
+
 	mthca_close_hca(mdev);
 
 err_cmd:
 	mthca_cmd_cleanup(mdev);
 
 err_free_dev:
-	if (mdev->mthca_flags & MTHCA_FLAG_MSI_X)
-		pci_disable_msix(pdev);
-	if (mdev->mthca_flags & MTHCA_FLAG_MSI)
-		pci_disable_msi(pdev);
-
 	ib_dealloc_device(&mdev->ib_dev);
 
 err_free_res:
diff --git a/drivers/infiniband/hw/mthca/mthca_srq.c b/drivers/infiniband/hw/mthca/mthca_srq.c
index 88d219e730ad..3f58c11a62b7 100644
--- a/drivers/infiniband/hw/mthca/mthca_srq.c
+++ b/drivers/infiniband/hw/mthca/mthca_srq.c
@@ -509,7 +509,7 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 	for (nreq = 0; wr; wr = wr->next) {
 		ind = srq->first_free;
 
-		if (ind < 0) {
+		if (unlikely(ind < 0)) {
 			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
 			err = -ENOMEM;
 			*bad_wr = wr;
@@ -519,7 +519,7 @@ int mthca_tavor_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 		wqe       = get_wqe(srq, ind);
 		next_ind  = *wqe_to_link(wqe);
 
-		if (next_ind < 0) {
+		if (unlikely(next_ind < 0)) {
 			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
 			err = -ENOMEM;
 			*bad_wr = wr;
@@ -623,7 +623,7 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		ind = srq->first_free;
 
-		if (ind < 0) {
+		if (unlikely(ind < 0)) {
 			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
 			err = -ENOMEM;
 			*bad_wr = wr;
@@ -633,7 +633,7 @@ int mthca_arbel_post_srq_recv(struct ib_srq *ibsrq, struct ib_recv_wr *wr,
 		wqe       = get_wqe(srq, ind);
 		next_ind  = *wqe_to_link(wqe);
 
-		if (next_ind < 0) {
+		if (unlikely(next_ind < 0)) {
 			mthca_err(dev, "SRQ %06x full\n", srq->srqn);
 			err = -ENOMEM;
 			*bad_wr = wr;