Diffstat (limited to 'arch/powerpc/kernel/iommu.c')
 arch/powerpc/kernel/iommu.c | 291 +++++++++++++++++++++++++++++++++--------
 1 file changed, 232 insertions(+), 59 deletions(-)
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 359f078571c7..ff5a6ce027b8 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -33,6 +33,9 @@
 #include <linux/bitmap.h>
 #include <linux/iommu-helper.h>
 #include <linux/crash_dump.h>
+#include <linux/hash.h>
+#include <linux/fault-inject.h>
+#include <linux/pci.h>
 #include <asm/io.h>
 #include <asm/prom.h>
 #include <asm/iommu.h>
@@ -40,6 +43,7 @@
 #include <asm/machdep.h>
 #include <asm/kdump.h>
 #include <asm/fadump.h>
+#include <asm/vio.h>
 
 #define DBG(...)
 
@@ -58,6 +62,114 @@ static int __init setup_iommu(char *str)
 
 __setup("iommu=", setup_iommu);
 
+static DEFINE_PER_CPU(unsigned int, iommu_pool_hash);
+
+/*
+ * We precalculate the hash to avoid doing it on every allocation.
+ *
+ * The hash is important to spread CPUs across all the pools. For example,
+ * on a POWER7 with 4-way SMT we take interrupts on the primary threads,
+ * and with 4 pools a simple modulo would map them all to the same pool.
+ */
+static int __init setup_iommu_pool_hash(void)
+{
+	unsigned int i;
+
+	for_each_possible_cpu(i)
+		per_cpu(iommu_pool_hash, i) = hash_32(i, IOMMU_POOL_HASHBITS);
+
+	return 0;
+}
+subsys_initcall(setup_iommu_pool_hash);
+
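To see why the hash matters, consider a minimal userspace sketch (hypothetical, not part of the patch) that compares a plain modulo against hash_32() for SMT-4 primary threads; the constant is the GOLDEN_RATIO_PRIME_32 the kernel's hash_32() used at the time, and IOMMU_POOL_HASHBITS is assumed to be 2 (four pools):

#include <stdio.h>

#define IOMMU_POOL_HASHBITS	2		/* four pools */
#define GOLDEN_RATIO_PRIME_32	0x9e370001U	/* kernel hash_32() constant */

/* Same multiplicative hash as the kernel's hash_32() of this era */
static unsigned int hash_32(unsigned int val, unsigned int bits)
{
	return (val * GOLDEN_RATIO_PRIME_32) >> (32 - bits);
}

int main(void)
{
	unsigned int cpu;

	/* With 4-way SMT the primary threads are CPUs 0, 4, 8, ... */
	for (cpu = 0; cpu < 32; cpu += 4)
		printf("cpu %2u: modulo -> pool %u, hashed -> pool %u\n",
		       cpu, cpu % 4, hash_32(cpu, IOMMU_POOL_HASHBITS));
	return 0;
}

The modulo column is all zeroes, i.e. every primary thread contending for one pool, while the hashed column scatters them across pools.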
+#ifdef CONFIG_FAIL_IOMMU
+
+static DECLARE_FAULT_ATTR(fail_iommu);
+
+static int __init setup_fail_iommu(char *str)
+{
+	return setup_fault_attr(&fail_iommu, str);
+}
+__setup("fail_iommu=", setup_fail_iommu);
+
+static bool should_fail_iommu(struct device *dev)
+{
+	return dev->archdata.fail_iommu && should_fail(&fail_iommu, 1);
+}
+
+static int __init fail_iommu_debugfs(void)
+{
+	struct dentry *dir = fault_create_debugfs_attr("fail_iommu",
+						       NULL, &fail_iommu);
+
+	return IS_ERR(dir) ? PTR_ERR(dir) : 0;
+}
+late_initcall(fail_iommu_debugfs);
+
+static ssize_t fail_iommu_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	return sprintf(buf, "%d\n", dev->archdata.fail_iommu);
+}
+
+static ssize_t fail_iommu_store(struct device *dev,
+				struct device_attribute *attr, const char *buf,
+				size_t count)
+{
+	int i;
+
+	if (count > 0 && sscanf(buf, "%d", &i) > 0)
+		dev->archdata.fail_iommu = (i == 0) ? 0 : 1;
+
+	return count;
+}
+
+static DEVICE_ATTR(fail_iommu, S_IRUGO|S_IWUSR, fail_iommu_show,
+		   fail_iommu_store);
+
+static int fail_iommu_bus_notify(struct notifier_block *nb,
+				 unsigned long action, void *data)
+{
+	struct device *dev = data;
+
+	if (action == BUS_NOTIFY_ADD_DEVICE) {
+		if (device_create_file(dev, &dev_attr_fail_iommu))
+			pr_warn("Unable to create IOMMU fault injection sysfs "
+				"entries\n");
+	} else if (action == BUS_NOTIFY_DEL_DEVICE) {
+		device_remove_file(dev, &dev_attr_fail_iommu);
+	}
+
+	return 0;
+}
+
+static struct notifier_block fail_iommu_bus_notifier = {
+	.notifier_call = fail_iommu_bus_notify
+};
+
+static int __init fail_iommu_setup(void)
+{
+#ifdef CONFIG_PCI
+	bus_register_notifier(&pci_bus_type, &fail_iommu_bus_notifier);
+#endif
+#ifdef CONFIG_IBMVIO
+	bus_register_notifier(&vio_bus_type, &fail_iommu_bus_notifier);
+#endif
+
+	return 0;
+}
+/*
+ * Must execute after the PCI and VIO subsystems have initialised but
+ * before devices are probed.
+ */
+arch_initcall(fail_iommu_setup);
+#else
+static inline bool should_fail_iommu(struct device *dev)
+{
+	return false;
+}
+#endif
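These hooks follow the generic fault-injection interface: fail_iommu=<interval>,<probability>,<space>,<times> on the kernel command line seeds the shared fault attributes, the sysfs fail_iommu attribute registered above opts an individual PCI or VIO device in, and fault_create_debugfs_attr() exposes the usual probability/interval/times knobs under a fail_iommu debugfs directory.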
+
 static unsigned long iommu_range_alloc(struct device *dev,
 				       struct iommu_table *tbl,
                                        unsigned long npages,
@@ -71,6 +183,9 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	int pass = 0;
 	unsigned long align_mask;
 	unsigned long boundary_size;
+	unsigned long flags;
+	unsigned int pool_nr;
+	struct iommu_pool *pool;
 
 	align_mask = 0xffffffffffffffffl >> (64 - align_order);
 
@@ -83,36 +198,49 @@ static unsigned long iommu_range_alloc(struct device *dev,
 		return DMA_ERROR_CODE;
 	}
 
-	if (handle && *handle)
-		start = *handle;
+	if (should_fail_iommu(dev))
+		return DMA_ERROR_CODE;
+
+	/*
+	 * We don't need to disable preemption here because any CPU can
+	 * safely use any IOMMU pool.
+	 */
+	pool_nr = __raw_get_cpu_var(iommu_pool_hash) & (tbl->nr_pools - 1);
+
+	if (largealloc)
+		pool = &(tbl->large_pool);
 	else
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
+		pool = &(tbl->pools[pool_nr]);
 
-	/* Use only half of the table for small allocs (15 pages or less) */
-	limit = largealloc ? tbl->it_size : tbl->it_halfpoint;
+	spin_lock_irqsave(&(pool->lock), flags);
 
-	if (largealloc && start < tbl->it_halfpoint)
-		start = tbl->it_halfpoint;
+again:
+	if ((pass == 0) && handle && *handle)
+		start = *handle;
+	else
+		start = pool->hint;
+
+	limit = pool->end;
 
 	/* The case below can happen if we have a small segment appended
 	 * to a large, or when the previous alloc was at the very end of
 	 * the available space. If so, go back to the initial start.
 	 */
 	if (start >= limit)
-		start = largealloc ? tbl->it_largehint : tbl->it_hint;
-
- again:
+		start = pool->start;
 
 	if (limit + tbl->it_offset > mask) {
 		limit = mask - tbl->it_offset + 1;
 		/* If we're constrained on address range, first try
 		 * at the masked hint to avoid O(n) search complexity,
-		 * but on second pass, start at 0.
+		 * but on second pass, start at 0 in pool 0.
 		 */
-		if ((start & mask) >= limit || pass > 0)
-			start = 0;
-		else
+		if ((start & mask) >= limit || pass > 0) {
+			pool = &(tbl->pools[0]);
+			start = pool->start;
+		} else {
 			start &= mask;
+		}
 	}
 
 	if (dev)
@@ -126,16 +254,25 @@ static unsigned long iommu_range_alloc(struct device *dev,
 			     tbl->it_offset, boundary_size >> IOMMU_PAGE_SHIFT,
 			     align_mask);
 	if (n == -1) {
-		if (likely(pass < 2)) {
-			/* First failure, just rescan the half of the table.
-			 * Second failure, rescan the other half of the table.
-			 */
-			start = (largealloc ^ pass) ? tbl->it_halfpoint : 0;
-			limit = pass ? tbl->it_size : limit;
+		if (likely(pass == 0)) {
+			/* First try the pool from the start */
+			pool->hint = pool->start;
 			pass++;
 			goto again;
+
+		} else if (pass <= tbl->nr_pools) {
+			/* Now try scanning all the other pools */
+			spin_unlock(&(pool->lock));
+			pool_nr = (pool_nr + 1) & (tbl->nr_pools - 1);
+			pool = &tbl->pools[pool_nr];
+			spin_lock(&(pool->lock));
+			pool->hint = pool->start;
+			pass++;
+			goto again;
+
 		} else {
-			/* Third failure, give up */
+			/* Give up */
+			spin_unlock_irqrestore(&(pool->lock), flags);
 			return DMA_ERROR_CODE;
 		}
 	}
@@ -145,10 +282,10 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	/* Bump the hint to a new block for small allocs. */
 	if (largealloc) {
 		/* Don't bump to new block to avoid fragmentation */
-		tbl->it_largehint = end;
+		pool->hint = end;
 	} else {
 		/* Overflow will be taken care of at the next allocation */
-		tbl->it_hint = (end + tbl->it_blocksize - 1) &
+		pool->hint = (end + tbl->it_blocksize - 1) &
 		                ~(tbl->it_blocksize - 1);
 	}
 
@@ -156,6 +293,8 @@ static unsigned long iommu_range_alloc(struct device *dev,
 	if (handle)
 		*handle = end;
 
+	spin_unlock_irqrestore(&(pool->lock), flags);
+
 	return n;
 }
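The retry ladder is easier to follow end to end in a self-contained toy model (hypothetical code, not the kernel's: a linear bitmap scan stands in for iommu_area_alloc(), and locking and the DMA mask handling are omitted):

#include <stdio.h>
#include <stdbool.h>

#define NR_POOLS	4
#define POOL_ENTRIES	16	/* toy table: 4 pools of 16 entries */

struct pool { unsigned long start, end, hint; };

static bool used[NR_POOLS * POOL_ENTRIES];	/* true = entry allocated */
static struct pool pools[NR_POOLS];

/* Find npages consecutive free entries in [from, p->end) */
static long scan(struct pool *p, unsigned long from, unsigned long npages)
{
	unsigned long i, run = 0;

	for (i = from; i < p->end; i++) {
		run = used[i] ? 0 : run + 1;
		if (run == npages)
			return i - npages + 1;
	}
	return -1;
}

/* Pass 0: home pool from its hint; pass 1: home pool from its start;
 * later passes: rotate through the remaining pools; then give up. */
static long range_alloc(unsigned int home, unsigned long npages)
{
	unsigned int pool_nr = home, pass;
	struct pool *p = &pools[pool_nr];
	unsigned long i, start = p->hint;
	long n;

	for (pass = 0; pass <= NR_POOLS + 1; pass++) {
		n = scan(p, start, npages);
		if (n >= 0) {
			for (i = n; i < n + npages; i++)
				used[i] = true;
			p->hint = n + npages;
			return n;
		}
		if (pass == 0) {
			start = p->start;	/* retry home pool from base */
		} else {
			pool_nr = (pool_nr + 1) % NR_POOLS;
			p = &pools[pool_nr];
			start = p->start;
		}
	}
	return -1;	/* every pool exhausted */
}

int main(void)
{
	unsigned int i;

	for (i = 0; i < NR_POOLS; i++)
		pools[i] = (struct pool){ i * POOL_ENTRIES,
					  (i + 1) * POOL_ENTRIES,
					  i * POOL_ENTRIES };

	printf("alloc 10 from pool 1 -> %ld\n", range_alloc(1, 10));
	printf("alloc 12 from pool 1 -> %ld\n", range_alloc(1, 12));
	return 0;
}

With pools of 16 entries, the first call fits in pool 1 at entry 16; the second finds only 6 free entries left there on both home-pool passes and spills into pool 2 at entry 32.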
 
@@ -165,18 +304,14 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 			      unsigned long mask, unsigned int align_order,
 			      struct dma_attrs *attrs)
 {
-	unsigned long entry, flags;
+	unsigned long entry;
 	dma_addr_t ret = DMA_ERROR_CODE;
 	int build_fail;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	entry = iommu_range_alloc(dev, tbl, npages, NULL, mask, align_order);
 
-	if (unlikely(entry == DMA_ERROR_CODE)) {
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
+	if (unlikely(entry == DMA_ERROR_CODE))
 		return DMA_ERROR_CODE;
-	}
 
 	entry += tbl->it_offset;	/* Offset into real TCE table */
 	ret = entry << IOMMU_PAGE_SHIFT;	/* Set the return dma address */
@@ -193,8 +328,6 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	 */
 	if (unlikely(build_fail)) {
 		__iommu_free(tbl, ret, npages);
-
-		spin_unlock_irqrestore(&(tbl->it_lock), flags);
 		return DMA_ERROR_CODE;
 	}
 
@@ -202,16 +335,14 @@ static dma_addr_t iommu_alloc(struct device *dev, struct iommu_table *tbl,
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	/* Make sure updates are seen by hardware */
 	mb();
 
 	return ret;
 }
 
-static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr, 
-			 unsigned int npages)
+static bool iommu_free_check(struct iommu_table *tbl, dma_addr_t dma_addr,
+			     unsigned int npages)
 {
 	unsigned long entry, free_entry;
 
@@ -231,20 +362,57 @@ static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 			printk(KERN_INFO "\tindex     = 0x%llx\n", (u64)tbl->it_index);
 			WARN_ON(1);
 		}
-		return;
+
+		return false;
+	}
+
+	return true;
+}
+
+static struct iommu_pool *get_pool(struct iommu_table *tbl,
+				   unsigned long entry)
+{
+	struct iommu_pool *p;
+	unsigned long largepool_start = tbl->large_pool.start;
+
+	/* The large pool is the last pool at the top of the table */
+	if (entry >= largepool_start) {
+		p = &tbl->large_pool;
+	} else {
+		unsigned int pool_nr = entry / tbl->poolsize;
+
+		BUG_ON(pool_nr > tbl->nr_pools);
+		p = &tbl->pools[pool_nr];
 	}
 
+	return p;
+}
+
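As a worked example of this lookup, take a hypothetical 2GB table of 4K pages: it_size = 524288 entries, and with the sizing rules iommu_init_table() gains below, poolsize = (524288 * 3 / 4) / 4 = 98304 and the large pool starts at 4 * 98304 = 393216:

#include <stdio.h>

int main(void)
{
	unsigned long poolsize = 98304, largepool_start = 393216;
	unsigned long entries[] = { 0, 98304, 294911, 393216, 524287 };
	unsigned int i;

	for (i = 0; i < sizeof(entries) / sizeof(entries[0]); i++) {
		if (entries[i] >= largepool_start)
			printf("entry %6lu -> large pool\n", entries[i]);
		else
			printf("entry %6lu -> pool %lu\n",
			       entries[i], entries[i] / poolsize);
	}
	return 0;
}

which maps the five sample entries to pool 0, pool 1, pool 2, and the large pool for the last two.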
+static void __iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
+			 unsigned int npages)
+{
+	unsigned long entry, free_entry;
+	unsigned long flags;
+	struct iommu_pool *pool;
+
+	entry = dma_addr >> IOMMU_PAGE_SHIFT;
+	free_entry = entry - tbl->it_offset;
+
+	pool = get_pool(tbl, free_entry);
+
+	if (!iommu_free_check(tbl, dma_addr, npages))
+		return;
+
 	ppc_md.tce_free(tbl, entry, npages);
+
+	spin_lock_irqsave(&(pool->lock), flags);
 	bitmap_clear(tbl->it_map, free_entry, npages);
+	spin_unlock_irqrestore(&(pool->lock), flags);
 }
 
 static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 		unsigned int npages)
 {
-	unsigned long flags;
-
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	__iommu_free(tbl, dma_addr, npages);
 
 	/* Make sure TLB cache is flushed if the HW needs it. We do
@@ -253,8 +421,6 @@ static void iommu_free(struct iommu_table *tbl, dma_addr_t dma_addr,
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
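Note the locking model this conversion leaves behind: the per-pool lock covers only the allocation bitmap, while tce_build()/tce_free() and tce_flush() now run unlocked; the bitmap gives each caller exclusive ownership of the entries it touches, and the mb() in iommu_alloc() orders the TCE updates against their first use by hardware.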
 
 int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
@@ -263,7 +429,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		 struct dma_attrs *attrs)
 {
 	dma_addr_t dma_next = 0, dma_addr;
-	unsigned long flags;
 	struct scatterlist *s, *outs, *segstart;
 	int outcount, incount, i, build_fail = 0;
 	unsigned int align;
@@ -285,8 +450,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 
 	DBG("sg mapping %d elements:\n", nelems);
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	max_seg_size = dma_get_max_seg_size(dev);
 	for_each_sg(sglist, s, nelems, i) {
 		unsigned long vaddr, npages, entry, slen;
@@ -369,8 +532,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
 
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
-
 	DBG("mapped %d elements:\n", outcount);
 
 	/* For the sake of iommu_unmap_sg, we clear out the length in the
@@ -402,7 +563,6 @@ int iommu_map_sg(struct device *dev, struct iommu_table *tbl,
 		if (s == outs)
 			break;
 	}
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 	return 0;
 }
 
@@ -412,15 +572,12 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 		struct dma_attrs *attrs)
 {
 	struct scatterlist *sg;
-	unsigned long flags;
 
 	BUG_ON(direction == DMA_NONE);
 
 	if (!tbl)
 		return;
 
-	spin_lock_irqsave(&(tbl->it_lock), flags);
-
 	sg = sglist;
 	while (nelems--) {
 		unsigned int npages;
@@ -440,8 +597,6 @@ void iommu_unmap_sg(struct iommu_table *tbl, struct scatterlist *sglist,
 	 */
 	if (ppc_md.tce_flush)
 		ppc_md.tce_flush(tbl);
-
-	spin_unlock_irqrestore(&(tbl->it_lock), flags);
 }
 
 static void iommu_table_clear(struct iommu_table *tbl)
@@ -494,9 +649,8 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	unsigned long sz;
 	static int welcomed = 0;
 	struct page *page;
-
-	/* Set aside 1/4 of the table for large allocations. */
-	tbl->it_halfpoint = tbl->it_size * 3 / 4;
+	unsigned int i;
+	struct iommu_pool *p;
 
 	/* number of bytes needed for the bitmap */
 	sz = (tbl->it_size + 7) >> 3;
@@ -515,9 +669,28 @@ struct iommu_table *iommu_init_table(struct iommu_table *tbl, int nid)
 	if (tbl->it_offset == 0)
 		set_bit(0, tbl->it_map);
 
-	tbl->it_hint = 0;
-	tbl->it_largehint = tbl->it_halfpoint;
-	spin_lock_init(&tbl->it_lock);
+	/* We only split the IOMMU table if we have 1GB or more of space */
+	if ((tbl->it_size << IOMMU_PAGE_SHIFT) >= (1UL * 1024 * 1024 * 1024))
+		tbl->nr_pools = IOMMU_NR_POOLS;
+	else
+		tbl->nr_pools = 1;
+
+	/* We reserve the top 1/4 of the table for large allocations */
+	tbl->poolsize = (tbl->it_size * 3 / 4) / tbl->nr_pools;
+
+	for (i = 0; i < tbl->nr_pools; i++) {
+		p = &tbl->pools[i];
+		spin_lock_init(&(p->lock));
+		p->start = tbl->poolsize * i;
+		p->hint = p->start;
+		p->end = p->start + tbl->poolsize;
+	}
+
+	p = &tbl->large_pool;
+	spin_lock_init(&(p->lock));
+	p->start = tbl->poolsize * i;
+	p->hint = p->start;
+	p->end = tbl->it_size;
 
 	iommu_table_clear(tbl);
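Continuing the hypothetical 2GB/4K example: nr_pools = 4 since the table covers at least 1GB, poolsize = 98304, the small pools span [0, 98304), [98304, 196608), [196608, 294912) and [294912, 393216), and the large pool takes the reserved top quarter, [393216, 524288). A table under 1GB instead keeps one small pool of 3/4 of the entries, with the same top quarter going to large allocations.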