author     Linus Torvalds <torvalds@linux-foundation.org>   2014-10-11 20:29:01 -0400
committer  Linus Torvalds <torvalds@linux-foundation.org>   2014-10-11 20:29:01 -0400
commit     81ae31d78239318610d7c2acb3e2610d622a5aa4 (patch)
tree       1e31b300f1574fceaff065a9bd92460b7c466f7c /arch/x86
parent     ef4a48c513211d842c55e84f7a1c31884b91dcf7 (diff)
parent     95afae481414cbdb0567bf82d5e5077c3ac9da20 (diff)
Merge tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip
Pull Xen updates from David Vrabel:
 "Features and fixes:

   - Add pvscsi frontend and backend drivers.
   - Remove _PAGE_IOMAP PTE flag, freeing it for alternate uses.
   - Try and keep memory contiguous during PV memory setup (reduces
     SWIOTLB usage).
   - Allow front/back drivers to use threaded irqs.
   - Support large initrds in PV guests.
   - Fix PVH guests in preparation for Xen 4.5"

* tag 'stable/for-linus-3.18-rc0-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/xen/tip: (22 commits)
  xen: remove DEFINE_XENBUS_DRIVER() macro
  xen/xenbus: Remove BUG_ON() when error string trucated
  xen/xenbus: Correct the comments for xenbus_grant_ring()
  x86/xen: Set EFER.NX and EFER.SCE in PVH guests
  xen: eliminate scalability issues from initrd handling
  xen: sync some headers with xen tree
  xen: make pvscsi frontend dependant on xenbus frontend
  arm{,64}/xen: Remove "EXPERIMENTAL" in the description of the Xen options
  xen-scsifront: don't deadlock if the ring becomes full
  x86: remove the Xen-specific _PAGE_IOMAP PTE flag
  x86/xen: do not use _PAGE_IOMAP PTE flag for I/O mappings
  x86: skip check for spurious faults for non-present faults
  xen/efi: Directly include needed headers
  xen-scsiback: clean up a type issue in scsiback_make_tpg()
  xen-scsifront: use GFP_ATOMIC under spin_lock
  MAINTAINERS: Add xen pvscsi maintainer
  xen-scsiback: Add Xen PV SCSI backend driver
  xen-scsifront: Add Xen PV SCSI frontend driver
  xen: Add Xen pvSCSI protocol description
  xen/events: support threaded irqs for interdomain event channels
  ...
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/include/asm/pgtable_types.h  |  11
-rw-r--r--  arch/x86/mm/fault.c                   |  22
-rw-r--r--  arch/x86/mm/init_32.c                 |   2
-rw-r--r--  arch/x86/mm/init_64.c                 |   2
-rw-r--r--  arch/x86/pci/i386.c                   |   2
-rw-r--r--  arch/x86/xen/efi.c                    |   2
-rw-r--r--  arch/x86/xen/enlighten.c              |  19
-rw-r--r--  arch/x86/xen/mmu.c                    |  48
-rw-r--r--  arch/x86/xen/p2m.c                    |  23
-rw-r--r--  arch/x86/xen/p2m.h                    |  15
-rw-r--r--  arch/x86/xen/setup.c                  | 370
-rw-r--r--  arch/x86/xen/smp.c                    |  29
-rw-r--r--  arch/x86/xen/smp.h                    |   8
-rw-r--r--  arch/x86/xen/xen-head.S               |  36
14 files changed, 424 insertions(+), 165 deletions(-)
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 0f9724c9c510..07789647bf33 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -23,7 +23,6 @@
 #define _PAGE_BIT_SPECIAL	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_CPA_TEST	_PAGE_BIT_SOFTW1
 #define _PAGE_BIT_SPLITTING	_PAGE_BIT_SOFTW2 /* only valid on a PSE pmd */
-#define _PAGE_BIT_IOMAP		_PAGE_BIT_SOFTW2 /* flag used to indicate IO mapping */
 #define _PAGE_BIT_HIDDEN	_PAGE_BIT_SOFTW3 /* hidden by kmemcheck */
 #define _PAGE_BIT_SOFT_DIRTY	_PAGE_BIT_SOFTW3 /* software dirty tracking */
 #define _PAGE_BIT_NX           63       /* No execute: only valid after cpuid check */
@@ -52,7 +51,7 @@
 #define _PAGE_PSE	(_AT(pteval_t, 1) << _PAGE_BIT_PSE)
 #define _PAGE_GLOBAL	(_AT(pteval_t, 1) << _PAGE_BIT_GLOBAL)
 #define _PAGE_SOFTW1	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW1)
-#define _PAGE_IOMAP	(_AT(pteval_t, 1) << _PAGE_BIT_IOMAP)
+#define _PAGE_SOFTW2	(_AT(pteval_t, 1) << _PAGE_BIT_SOFTW2)
 #define _PAGE_PAT	(_AT(pteval_t, 1) << _PAGE_BIT_PAT)
 #define _PAGE_PAT_LARGE (_AT(pteval_t, 1) << _PAGE_BIT_PAT_LARGE)
 #define _PAGE_SPECIAL	(_AT(pteval_t, 1) << _PAGE_BIT_SPECIAL)
@@ -168,10 +167,10 @@
 #define __PAGE_KERNEL_LARGE_NOCACHE	(__PAGE_KERNEL | _PAGE_CACHE_UC | _PAGE_PSE)
 #define __PAGE_KERNEL_LARGE_EXEC	(__PAGE_KERNEL_EXEC | _PAGE_PSE)
 
-#define __PAGE_KERNEL_IO		(__PAGE_KERNEL | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_NOCACHE	(__PAGE_KERNEL_NOCACHE | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_UC_MINUS	(__PAGE_KERNEL_UC_MINUS | _PAGE_IOMAP)
-#define __PAGE_KERNEL_IO_WC		(__PAGE_KERNEL_WC | _PAGE_IOMAP)
+#define __PAGE_KERNEL_IO		(__PAGE_KERNEL)
+#define __PAGE_KERNEL_IO_NOCACHE	(__PAGE_KERNEL_NOCACHE)
+#define __PAGE_KERNEL_IO_UC_MINUS	(__PAGE_KERNEL_UC_MINUS)
+#define __PAGE_KERNEL_IO_WC		(__PAGE_KERNEL_WC)
 
 #define PAGE_KERNEL			__pgprot(__PAGE_KERNEL)
 #define PAGE_KERNEL_RO			__pgprot(__PAGE_KERNEL_RO)
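
The hunk above drops _PAGE_IOMAP and re-exposes its bit as the generic _PAGE_SOFTW2. A minimal standalone sketch (not kernel code; _PAGE_MYFLAG is a hypothetical name) of how a later feature could claim the freed software bit, mirroring the definition added above; the AVL bits 9-11 of an x86 PTE are ignored by hardware:

/*
 * Standalone sketch, not part of this patch.  _PAGE_MYFLAG is a
 * hypothetical later user of the software bit freed by removing
 * _PAGE_IOMAP.
 */
#include <stdint.h>
#include <stdio.h>

typedef uint64_t pteval_t;

#define _PAGE_BIT_SOFTW2	10	/* second software-available PTE bit */
#define _PAGE_SOFTW2		((pteval_t)1 << _PAGE_BIT_SOFTW2)
#define _PAGE_MYFLAG		_PAGE_SOFTW2	/* hypothetical new user */

int main(void)
{
	printf("freed software bit mask: %#llx\n",
	       (unsigned long long)_PAGE_MYFLAG);	/* 0x400 */
	return 0;
}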
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index a24194681513..83bb03bfa259 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -933,8 +933,17 @@ static int spurious_fault_check(unsigned long error_code, pte_t *pte)
  * cross-processor TLB flush, even if no stale TLB entries exist
  * on other processors.
  *
+ * Spurious faults may only occur if the TLB contains an entry with
+ * fewer permission than the page table entry.  Non-present (P = 0)
+ * and reserved bit (R = 1) faults are never spurious.
+ *
  * There are no security implications to leaving a stale TLB when
  * increasing the permissions on a page.
+ *
+ * Returns non-zero if a spurious fault was handled, zero otherwise.
+ *
+ * See Intel Developer's Manual Vol 3 Section 4.10.4.3, bullet 3
+ * (Optional Invalidation).
  */
 static noinline int
 spurious_fault(unsigned long error_code, unsigned long address)
@@ -945,8 +954,17 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	pte_t *pte;
 	int ret;
 
-	/* Reserved-bit violation or user access to kernel space? */
-	if (error_code & (PF_USER | PF_RSVD))
+	/*
+	 * Only writes to RO or instruction fetches from NX may cause
+	 * spurious faults.
+	 *
+	 * These could be from user or supervisor accesses but the TLB
+	 * is only lazily flushed after a kernel mapping protection
+	 * change, so user accesses are not expected to cause spurious
+	 * faults.
+	 */
+	if (error_code != (PF_WRITE | PF_PROT)
+	    && error_code != (PF_INSTR | PF_PROT))
 		return 0;
 
 	pgd = init_mm.pgd + pgd_index(address);
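
The new check above replaces the old PF_USER/PF_RSVD test with an exact match on the two error codes a stale TLB entry can produce. A minimal standalone sketch of that predicate, using the architectural x86 #PF error-code bits rather than the kernel's internal PF_* definitions:

/* Standalone sketch of the spurious-fault filter added above. */
#include <stdbool.h>
#include <stdio.h>

#define PF_PROT  (1 << 0)	/* fault on a present page */
#define PF_WRITE (1 << 1)	/* write access */
#define PF_INSTR (1 << 4)	/* instruction fetch */

static bool maybe_spurious(unsigned long error_code)
{
	/* Only a write to a read-only page or a fetch from an NX page,
	 * both on present PTEs, can be a stale-TLB (spurious) fault. */
	return error_code == (PF_WRITE | PF_PROT) ||
	       error_code == (PF_INSTR | PF_PROT);
}

int main(void)
{
	printf("write|prot:        %d\n", maybe_spurious(PF_WRITE | PF_PROT)); /* 1 */
	printf("not-present write: %d\n", maybe_spurious(PF_WRITE));           /* 0 */
	return 0;
}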
diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c
index 7d05565ba781..c8140e12816a 100644
--- a/arch/x86/mm/init_32.c
+++ b/arch/x86/mm/init_32.c
@@ -537,7 +537,7 @@ static void __init pagetable_init(void)
 	permanent_kmaps_init(pgd_base);
 }
 
-pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL | _PAGE_IOMAP);
+pteval_t __supported_pte_mask __read_mostly = ~(_PAGE_NX | _PAGE_GLOBAL);
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 /* user-defined highmem size */
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 5621c47d7a1a..5d984769cbd8 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -151,7 +151,7 @@ early_param("gbpages", parse_direct_gbpages_on);
  * around without checking the pgd every time.
  */
 
-pteval_t __supported_pte_mask __read_mostly = ~_PAGE_IOMAP;
+pteval_t __supported_pte_mask __read_mostly = ~0;
 EXPORT_SYMBOL_GPL(__supported_pte_mask);
 
 int force_personality32;
diff --git a/arch/x86/pci/i386.c b/arch/x86/pci/i386.c
index 2ae525e0d8ba..37c1435889ce 100644
--- a/arch/x86/pci/i386.c
+++ b/arch/x86/pci/i386.c
@@ -442,8 +442,6 @@ int pci_mmap_page_range(struct pci_dev *dev, struct vm_area_struct *vma,
 		 */
 		prot |= _PAGE_CACHE_UC_MINUS;
 
-	prot |= _PAGE_IOMAP;	/* creating a mapping for IO */
-
 	vma->vm_page_prot = __pgprot(prot);
 
 	if (io_remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
diff --git a/arch/x86/xen/efi.c b/arch/x86/xen/efi.c
index a02e09e18f57..be14cc3e48d5 100644
--- a/arch/x86/xen/efi.c
+++ b/arch/x86/xen/efi.c
@@ -15,12 +15,14 @@
  * with this program.  If not, see <http://www.gnu.org/licenses/>.
  */
 
+#include <linux/bitops.h>
 #include <linux/efi.h>
 #include <linux/init.h>
 #include <linux/string.h>
 
 #include <xen/xen-ops.h>
 
+#include <asm/page.h>
 #include <asm/setup.h>
 
 void __init xen_efi_init(void)
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index c0cb11fb5008..acb0effd8077 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -1463,6 +1463,7 @@ static void __ref xen_setup_gdt(int cpu)
 	pv_cpu_ops.load_gdt = xen_load_gdt;
 }
 
+#ifdef CONFIG_XEN_PVH
 /*
  * A PV guest starts with default flags that are not set for PVH, set them
  * here asap.
@@ -1508,17 +1509,21 @@ static void __init xen_pvh_early_guest_init(void)
 		return;
 
 	xen_have_vector_callback = 1;
+
+	xen_pvh_early_cpu_init(0, false);
 	xen_pvh_set_cr_flags(0);
 
 #ifdef CONFIG_X86_32
 	BUG(); /* PVH: Implement proper support. */
 #endif
 }
+#endif    /* CONFIG_XEN_PVH */
 
 /* First C function to be called on Xen boot */
 asmlinkage __visible void __init xen_start_kernel(void)
 {
 	struct physdev_set_iopl set_iopl;
+	unsigned long initrd_start = 0;
 	int rc;
 
 	if (!xen_start_info)
@@ -1527,7 +1532,9 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	xen_domain_type = XEN_PV_DOMAIN;
 
 	xen_setup_features();
+#ifdef CONFIG_XEN_PVH
 	xen_pvh_early_guest_init();
+#endif
 	xen_setup_machphys_mapping();
 
 	/* Install Xen paravirt ops */
@@ -1559,8 +1566,6 @@ asmlinkage __visible void __init xen_start_kernel(void)
 #endif
 		__supported_pte_mask &= ~(_PAGE_PWT | _PAGE_PCD);
 
-	__supported_pte_mask |= _PAGE_IOMAP;
-
 	/*
 	 * Prevent page tables from being allocated in highmem, even
 	 * if CONFIG_HIGHPTE is enabled.
@@ -1667,10 +1672,16 @@ asmlinkage __visible void __init xen_start_kernel(void)
 	new_cpu_data.x86_capability[0] = cpuid_edx(1);
 #endif
 
+	if (xen_start_info->mod_start) {
+	    if (xen_start_info->flags & SIF_MOD_START_PFN)
+		initrd_start = PFN_PHYS(xen_start_info->mod_start);
+	    else
+		initrd_start = __pa(xen_start_info->mod_start);
+	}
+
 	/* Poke various useful things into boot_params */
 	boot_params.hdr.type_of_loader = (9 << 4) | 0;
-	boot_params.hdr.ramdisk_image = xen_start_info->mod_start
-		? __pa(xen_start_info->mod_start) : 0;
+	boot_params.hdr.ramdisk_image = initrd_start;
 	boot_params.hdr.ramdisk_size = xen_start_info->mod_len;
 	boot_params.hdr.cmd_line_ptr = __pa(xen_start_info->cmd_line);
 
diff --git a/arch/x86/xen/mmu.c b/arch/x86/xen/mmu.c
index 16fb0099b7f2..f62af7647ec9 100644
--- a/arch/x86/xen/mmu.c
+++ b/arch/x86/xen/mmu.c
@@ -399,38 +399,14 @@ static pteval_t pte_pfn_to_mfn(pteval_t val)
 		if (unlikely(mfn == INVALID_P2M_ENTRY)) {
 			mfn = 0;
 			flags = 0;
-		} else {
-			/*
-			 * Paramount to do this test _after_ the
-			 * INVALID_P2M_ENTRY as INVALID_P2M_ENTRY &
-			 * IDENTITY_FRAME_BIT resolves to true.
-			 */
-			mfn &= ~FOREIGN_FRAME_BIT;
-			if (mfn & IDENTITY_FRAME_BIT) {
-				mfn &= ~IDENTITY_FRAME_BIT;
-				flags |= _PAGE_IOMAP;
-			}
-		}
+		} else
+			mfn &= ~(FOREIGN_FRAME_BIT | IDENTITY_FRAME_BIT);
 		val = ((pteval_t)mfn << PAGE_SHIFT) | flags;
 	}
 
 	return val;
 }
 
-static pteval_t iomap_pte(pteval_t val)
-{
-	if (val & _PAGE_PRESENT) {
-		unsigned long pfn = (val & PTE_PFN_MASK) >> PAGE_SHIFT;
-		pteval_t flags = val & PTE_FLAGS_MASK;
-
-		/* We assume the pte frame number is a MFN, so
-		   just use it as-is. */
-		val = ((pteval_t)pfn << PAGE_SHIFT) | flags;
-	}
-
-	return val;
-}
-
 __visible pteval_t xen_pte_val(pte_t pte)
 {
 	pteval_t pteval = pte.pte;
@@ -441,9 +417,6 @@ __visible pteval_t xen_pte_val(pte_t pte)
 		pteval = (pteval & ~_PAGE_PAT) | _PAGE_PWT;
 	}
 #endif
-	if (xen_initial_domain() && (pteval & _PAGE_IOMAP))
-		return pteval;
-
 	return pte_mfn_to_pfn(pteval);
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_pte_val);
@@ -481,7 +454,6 @@ void xen_set_pat(u64 pat)
 
 __visible pte_t xen_make_pte(pteval_t pte)
 {
-	phys_addr_t addr = (pte & PTE_PFN_MASK);
 #if 0
 	/* If Linux is trying to set a WC pte, then map to the Xen WC.
 	 * If _PAGE_PAT is set, then it probably means it is really
@@ -496,19 +468,7 @@ __visible pte_t xen_make_pte(pteval_t pte)
 			pte = (pte & ~(_PAGE_PCD | _PAGE_PWT)) | _PAGE_PAT;
 	}
 #endif
-	/*
-	 * Unprivileged domains are allowed to do IOMAPpings for
-	 * PCI passthrough, but not map ISA space.  The ISA
-	 * mappings are just dummy local mappings to keep other
-	 * parts of the kernel happy.
-	 */
-	if (unlikely(pte & _PAGE_IOMAP) &&
-	    (xen_initial_domain() || addr >= ISA_END_ADDRESS)) {
-		pte = iomap_pte(pte);
-	} else {
-		pte &= ~_PAGE_IOMAP;
-		pte = pte_pfn_to_mfn(pte);
-	}
+	pte = pte_pfn_to_mfn(pte);
 
 	return native_make_pte(pte);
 }
@@ -2091,7 +2051,7 @@ static void xen_set_fixmap(unsigned idx, phys_addr_t phys, pgprot_t prot)
 
 	default:
 		/* By default, set_fixmap is used for hardware mappings */
-		pte = mfn_pte(phys, __pgprot(pgprot_val(prot) | _PAGE_IOMAP));
+		pte = mfn_pte(phys, prot);
 		break;
 	}
 
diff --git a/arch/x86/xen/p2m.c b/arch/x86/xen/p2m.c
index 3172692381ae..9f5983b01ed9 100644
--- a/arch/x86/xen/p2m.c
+++ b/arch/x86/xen/p2m.c
@@ -173,6 +173,7 @@
 #include <xen/balloon.h>
 #include <xen/grant_table.h>
 
+#include "p2m.h"
 #include "multicalls.h"
 #include "xen-ops.h"
 
@@ -180,12 +181,6 @@ static void __init m2p_override_init(void);
 
 unsigned long xen_max_p2m_pfn __read_mostly;
 
-#define P2M_PER_PAGE		(PAGE_SIZE / sizeof(unsigned long))
-#define P2M_MID_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long *))
-#define P2M_TOP_PER_PAGE	(PAGE_SIZE / sizeof(unsigned long **))
-
-#define MAX_P2M_PFN		(P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
-
 /* Placeholders for holes in the address space */
 static RESERVE_BRK_ARRAY(unsigned long, p2m_missing, P2M_PER_PAGE);
 static RESERVE_BRK_ARRAY(unsigned long *, p2m_mid_missing, P2M_MID_PER_PAGE);
@@ -202,16 +197,12 @@ static RESERVE_BRK_ARRAY(unsigned long, p2m_mid_identity_mfn, P2M_MID_PER_PAGE);
 RESERVE_BRK(p2m_mid, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 RESERVE_BRK(p2m_mid_mfn, PAGE_SIZE * (MAX_DOMAIN_PAGES / (P2M_PER_PAGE * P2M_MID_PER_PAGE)));
 
-/* We might hit two boundary violations at the start and end, at max each
- * boundary violation will require three middle nodes. */
-RESERVE_BRK(p2m_mid_extra, PAGE_SIZE * 2 * 3);
-
-/* When we populate back during bootup, the amount of pages can vary. The
- * max we have is seen is 395979, but that does not mean it can't be more.
- * Some machines can have 3GB I/O holes even. With early_can_reuse_p2m_middle
- * it can re-use Xen provided mfn_list array, so we only need to allocate at
- * most three P2M top nodes. */
-RESERVE_BRK(p2m_populated, PAGE_SIZE * 3);
+/* For each I/O range remapped we may lose up to two leaf pages for the boundary
+ * violations and three mid pages to cover up to 3GB. With
+ * early_can_reuse_p2m_middle() most of the leaf pages will be reused by the
+ * remapped region.
+ */
+RESERVE_BRK(p2m_identity_remap, PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES);
 
 static inline unsigned p2m_top_index(unsigned long pfn)
 {
diff --git a/arch/x86/xen/p2m.h b/arch/x86/xen/p2m.h
new file mode 100644
index 000000000000..ad8aee24ab72
--- /dev/null
+++ b/arch/x86/xen/p2m.h
@@ -0,0 +1,15 @@
+#ifndef _XEN_P2M_H
+#define _XEN_P2M_H
+
+#define P2M_PER_PAGE        (PAGE_SIZE / sizeof(unsigned long))
+#define P2M_MID_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long *))
+#define P2M_TOP_PER_PAGE    (PAGE_SIZE / sizeof(unsigned long **))
+
+#define MAX_P2M_PFN         (P2M_TOP_PER_PAGE * P2M_MID_PER_PAGE * P2M_PER_PAGE)
+
+#define MAX_REMAP_RANGES    10
+
+extern unsigned long __init set_phys_range_identity(unsigned long pfn_s,
+                                      unsigned long pfn_e);
+
+#endif  /* _XEN_P2M_H */
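
For a sense of scale, a standalone sketch of what these constants work out to with 4 KiB pages on x86-64, including the size of the p2m_identity_remap brk reservation made in p2m.c above (PAGE_SIZE * 2 * 3 * MAX_REMAP_RANGES):

/* Standalone arithmetic sketch, not kernel code. */
#include <stdio.h>

int main(void)
{
	unsigned long page_size = 4096;
	unsigned long longsize = 8;	/* sizeof(unsigned long) on x86-64 */
	unsigned long p2m_per_page = page_size / longsize;	/* 512 */
	unsigned long max_remap_ranges = 10;			/* MAX_REMAP_RANGES */
	unsigned long reserve = page_size * 2 * 3 * max_remap_ranges;

	printf("P2M_PER_PAGE       = %lu\n", p2m_per_page);
	printf("p2m_identity_remap = %lu KiB\n", reserve / 1024);	/* 240 KiB */
	return 0;
}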
diff --git a/arch/x86/xen/setup.c b/arch/x86/xen/setup.c
index 2e555163c2fe..af7216128d93 100644
--- a/arch/x86/xen/setup.c
+++ b/arch/x86/xen/setup.c
@@ -29,6 +29,7 @@
 #include <xen/features.h>
 #include "xen-ops.h"
 #include "vdso.h"
+#include "p2m.h"
 
 /* These are code, but not functions.  Defined in entry.S */
 extern const char xen_hypervisor_callback[];
@@ -46,6 +47,9 @@ struct xen_memory_region xen_extra_mem[XEN_EXTRA_MEM_MAX_REGIONS] __initdata;
 /* Number of pages released from the initial allocation. */
 unsigned long xen_released_pages;
 
+/* Buffer used to remap identity mapped pages */
+unsigned long xen_remap_buf[P2M_PER_PAGE] __initdata;
+
 /* 
  * The maximum amount of extra memory compared to the base size.  The
  * main scaling factor is the size of struct page.  At extreme ratios
@@ -151,107 +155,325 @@ static unsigned long __init xen_do_chunk(unsigned long start,
 	return len;
 }
 
-static unsigned long __init xen_release_chunk(unsigned long start,
-					      unsigned long end)
-{
-	return xen_do_chunk(start, end, true);
-}
-
-static unsigned long __init xen_populate_chunk(
+/*
+ * Finds the next RAM pfn available in the E820 map after min_pfn.
+ * This function updates min_pfn with the pfn found and returns
+ * the size of that range or zero if not found.
+ */
+static unsigned long __init xen_find_pfn_range(
 	const struct e820entry *list, size_t map_size,
-	unsigned long max_pfn, unsigned long *last_pfn,
-	unsigned long credits_left)
+	unsigned long *min_pfn)
 {
 	const struct e820entry *entry;
 	unsigned int i;
 	unsigned long done = 0;
-	unsigned long dest_pfn;
 
 	for (i = 0, entry = list; i < map_size; i++, entry++) {
 		unsigned long s_pfn;
 		unsigned long e_pfn;
-		unsigned long pfns;
-		long capacity;
-
-		if (credits_left <= 0)
-			break;
 
 		if (entry->type != E820_RAM)
 			continue;
 
 		e_pfn = PFN_DOWN(entry->addr + entry->size);
 
-		/* We only care about E820 after the xen_start_info->nr_pages */
-		if (e_pfn <= max_pfn)
+		/* We only care about E820 after this */
+		if (e_pfn < *min_pfn)
 			continue;
 
 		s_pfn = PFN_UP(entry->addr);
-		/* If the E820 falls within the nr_pages, we want to start
-		 * at the nr_pages PFN.
-		 * If that would mean going past the E820 entry, skip it
+
+		/* If min_pfn falls within the E820 entry, we want to start
+		 * at the min_pfn PFN.
 		 */
-		if (s_pfn <= max_pfn) {
-			capacity = e_pfn - max_pfn;
-			dest_pfn = max_pfn;
+		if (s_pfn <= *min_pfn) {
+			done = e_pfn - *min_pfn;
 		} else {
-			capacity = e_pfn - s_pfn;
-			dest_pfn = s_pfn;
+			done = e_pfn - s_pfn;
+			*min_pfn = s_pfn;
 		}
+		break;
+	}
 
-		if (credits_left < capacity)
-			capacity = credits_left;
+	return done;
+}
 
-		pfns = xen_do_chunk(dest_pfn, dest_pfn + capacity, false);
-		done += pfns;
-		*last_pfn = (dest_pfn + pfns);
-		if (pfns < capacity)
-			break;
-		credits_left -= pfns;
+/*
+ * This releases a chunk of memory and then does the identity map. It's used as
+ * as a fallback if the remapping fails.
+ */
+static void __init xen_set_identity_and_release_chunk(unsigned long start_pfn,
+	unsigned long end_pfn, unsigned long nr_pages, unsigned long *identity,
+	unsigned long *released)
+{
+	WARN_ON(start_pfn > end_pfn);
+
+	/* Need to release pages first */
+	*released += xen_do_chunk(start_pfn, min(end_pfn, nr_pages), true);
+	*identity += set_phys_range_identity(start_pfn, end_pfn);
+}
+
+/*
+ * Helper function to update both the p2m and m2p tables.
+ */
+static unsigned long __init xen_update_mem_tables(unsigned long pfn,
+						  unsigned long mfn)
+{
+	struct mmu_update update = {
+		.ptr = ((unsigned long long)mfn << PAGE_SHIFT) | MMU_MACHPHYS_UPDATE,
+		.val = pfn
+	};
+
+	/* Update p2m */
+	if (!early_set_phys_to_machine(pfn, mfn)) {
+		WARN(1, "Failed to set p2m mapping for pfn=%ld mfn=%ld\n",
+		     pfn, mfn);
+		return false;
 	}
-	return done;
+
+	/* Update m2p */
+	if (HYPERVISOR_mmu_update(&update, 1, NULL, DOMID_SELF) < 0) {
+		WARN(1, "Failed to set m2p mapping for mfn=%ld pfn=%ld\n",
+		     mfn, pfn);
+		return false;
+	}
+
+	return true;
 }
 
-static void __init xen_set_identity_and_release_chunk(
-	unsigned long start_pfn, unsigned long end_pfn, unsigned long nr_pages,
-	unsigned long *released, unsigned long *identity)
+/*
+ * This function updates the p2m and m2p tables with an identity map from
+ * start_pfn to start_pfn+size and remaps the underlying RAM of the original
+ * allocation at remap_pfn. It must do so carefully in P2M_PER_PAGE sized blocks
+ * to not exhaust the reserved brk space. Doing it in properly aligned blocks
+ * ensures we only allocate the minimum required leaf pages in the p2m table. It
+ * copies the existing mfns from the p2m table under the 1:1 map, overwrites
+ * them with the identity map and then updates the p2m and m2p tables with the
+ * remapped memory.
+ */
+static unsigned long __init xen_do_set_identity_and_remap_chunk(
+        unsigned long start_pfn, unsigned long size, unsigned long remap_pfn)
 {
-	unsigned long pfn;
+	unsigned long ident_pfn_iter, remap_pfn_iter;
+	unsigned long ident_start_pfn_align, remap_start_pfn_align;
+	unsigned long ident_end_pfn_align, remap_end_pfn_align;
+	unsigned long ident_boundary_pfn, remap_boundary_pfn;
+	unsigned long ident_cnt = 0;
+	unsigned long remap_cnt = 0;
+	unsigned long left = size;
+	unsigned long mod;
+	int i;
+
+	WARN_ON(size == 0);
+
+	BUG_ON(xen_feature(XENFEAT_auto_translated_physmap));
 
 	/*
-	 * If the PFNs are currently mapped, clear the mappings
-	 * (except for the ISA region which must be 1:1 mapped) to
-	 * release the refcounts (in Xen) on the original frames.
+	 * Determine the proper alignment to remap memory in P2M_PER_PAGE sized
+	 * blocks. We need to keep track of both the existing pfn mapping and
+	 * the new pfn remapping.
 	 */
-	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++) {
-		pte_t pte = __pte_ma(0);
+	mod = start_pfn % P2M_PER_PAGE;
+	ident_start_pfn_align =
+		mod ? (start_pfn - mod + P2M_PER_PAGE) : start_pfn;
+	mod = remap_pfn % P2M_PER_PAGE;
+	remap_start_pfn_align =
+		mod ? (remap_pfn - mod + P2M_PER_PAGE) : remap_pfn;
+	mod = (start_pfn + size) % P2M_PER_PAGE;
+	ident_end_pfn_align = start_pfn + size - mod;
+	mod = (remap_pfn + size) % P2M_PER_PAGE;
+	remap_end_pfn_align = remap_pfn + size - mod;
+
+	/* Iterate over each p2m leaf node in each range */
+	for (ident_pfn_iter = ident_start_pfn_align, remap_pfn_iter = remap_start_pfn_align;
+	     ident_pfn_iter < ident_end_pfn_align && remap_pfn_iter < remap_end_pfn_align;
+	     ident_pfn_iter += P2M_PER_PAGE, remap_pfn_iter += P2M_PER_PAGE) {
+		/* Check we aren't past the end */
+		BUG_ON(ident_pfn_iter + P2M_PER_PAGE > start_pfn + size);
+		BUG_ON(remap_pfn_iter + P2M_PER_PAGE > remap_pfn + size);
+
+		/* Save p2m mappings */
+		for (i = 0; i < P2M_PER_PAGE; i++)
+			xen_remap_buf[i] = pfn_to_mfn(ident_pfn_iter + i);
+
+		/* Set identity map which will free a p2m leaf */
+		ident_cnt += set_phys_range_identity(ident_pfn_iter,
+			ident_pfn_iter + P2M_PER_PAGE);
+
+#ifdef DEBUG
+		/* Helps verify a p2m leaf has been freed */
+		for (i = 0; i < P2M_PER_PAGE; i++) {
+			unsigned int pfn = ident_pfn_iter + i;
+			BUG_ON(pfn_to_mfn(pfn) != pfn);
+		}
+#endif
+		/* Now remap memory */
+		for (i = 0; i < P2M_PER_PAGE; i++) {
+			unsigned long mfn = xen_remap_buf[i];
+
+			/* This will use the p2m leaf freed above */
+			if (!xen_update_mem_tables(remap_pfn_iter + i, mfn)) {
+				WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+					remap_pfn_iter + i, mfn);
+				return 0;
+			}
+
+			remap_cnt++;
+		}
 
-		if (pfn < PFN_UP(ISA_END_ADDRESS))
-			pte = mfn_pte(pfn, PAGE_KERNEL_IO);
+		left -= P2M_PER_PAGE;
+	}
 
-		(void)HYPERVISOR_update_va_mapping(
-			(unsigned long)__va(pfn << PAGE_SHIFT), pte, 0);
+	/* Max boundary space possible */
+	BUG_ON(left > (P2M_PER_PAGE - 1) * 2);
+
+	/* Now handle the boundary conditions */
+	ident_boundary_pfn = start_pfn;
+	remap_boundary_pfn = remap_pfn;
+	for (i = 0; i < left; i++) {
+		unsigned long mfn;
+
+		/* These two checks move from the start to end boundaries */
+		if (ident_boundary_pfn == ident_start_pfn_align)
+			ident_boundary_pfn = ident_pfn_iter;
+		if (remap_boundary_pfn == remap_start_pfn_align)
+			remap_boundary_pfn = remap_pfn_iter;
+
+		/* Check we aren't past the end */
+		BUG_ON(ident_boundary_pfn >= start_pfn + size);
+		BUG_ON(remap_boundary_pfn >= remap_pfn + size);
+
+		mfn = pfn_to_mfn(ident_boundary_pfn);
+
+		if (!xen_update_mem_tables(remap_boundary_pfn, mfn)) {
+			WARN(1, "Failed to update mem mapping for pfn=%ld mfn=%ld\n",
+				remap_pfn_iter + i, mfn);
+			return 0;
+		}
+		remap_cnt++;
+
+		ident_boundary_pfn++;
+		remap_boundary_pfn++;
 	}
 
-	if (start_pfn < nr_pages)
-		*released += xen_release_chunk(
-			start_pfn, min(end_pfn, nr_pages));
+	/* Finish up the identity map */
+	if (ident_start_pfn_align >= ident_end_pfn_align) {
+		/*
+                 * In this case we have an identity range which does not span an
+                 * aligned block so everything needs to be identity mapped here.
+                 * If we didn't check this we might remap too many pages since
+                 * the align boundaries are not meaningful in this case.
+	         */
+		ident_cnt += set_phys_range_identity(start_pfn,
+			start_pfn + size);
+	} else {
+		/* Remapped above so check each end of the chunk */
+		if (start_pfn < ident_start_pfn_align)
+			ident_cnt += set_phys_range_identity(start_pfn,
+				ident_start_pfn_align);
+		if (start_pfn + size > ident_pfn_iter)
+			ident_cnt += set_phys_range_identity(ident_pfn_iter,
+				start_pfn + size);
+	}
 
-	*identity += set_phys_range_identity(start_pfn, end_pfn);
+	BUG_ON(ident_cnt != size);
+	BUG_ON(remap_cnt != size);
+
+	return size;
 }
 
-static unsigned long __init xen_set_identity_and_release(
-	const struct e820entry *list, size_t map_size, unsigned long nr_pages)
+/*
+ * This function takes a contiguous pfn range that needs to be identity mapped
+ * and:
+ *
+ *  1) Finds a new range of pfns to use to remap based on E820 and remap_pfn.
+ *  2) Calls the do_ function to actually do the mapping/remapping work.
+ *
+ * The goal is to not allocate additional memory but to remap the existing
+ * pages. In the case of an error the underlying memory is simply released back
+ * to Xen and not remapped.
+ */
+static unsigned long __init xen_set_identity_and_remap_chunk(
+        const struct e820entry *list, size_t map_size, unsigned long start_pfn,
+	unsigned long end_pfn, unsigned long nr_pages, unsigned long remap_pfn,
+	unsigned long *identity, unsigned long *remapped,
+	unsigned long *released)
+{
+	unsigned long pfn;
+	unsigned long i = 0;
+	unsigned long n = end_pfn - start_pfn;
+
+	while (i < n) {
+		unsigned long cur_pfn = start_pfn + i;
+		unsigned long left = n - i;
+		unsigned long size = left;
+		unsigned long remap_range_size;
+
+		/* Do not remap pages beyond the current allocation */
+		if (cur_pfn >= nr_pages) {
+			/* Identity map remaining pages */
+			*identity += set_phys_range_identity(cur_pfn,
+				cur_pfn + size);
+			break;
+		}
+		if (cur_pfn + size > nr_pages)
+			size = nr_pages - cur_pfn;
+
+		remap_range_size = xen_find_pfn_range(list, map_size,
+						      &remap_pfn);
+		if (!remap_range_size) {
+			pr_warning("Unable to find available pfn range, not remapping identity pages\n");
+			xen_set_identity_and_release_chunk(cur_pfn,
+				cur_pfn + left, nr_pages, identity, released);
+			break;
+		}
+		/* Adjust size to fit in current e820 RAM region */
+		if (size > remap_range_size)
+			size = remap_range_size;
+
+		if (!xen_do_set_identity_and_remap_chunk(cur_pfn, size, remap_pfn)) {
+			WARN(1, "Failed to remap 1:1 memory cur_pfn=%ld size=%ld remap_pfn=%ld\n",
+				cur_pfn, size, remap_pfn);
+			xen_set_identity_and_release_chunk(cur_pfn,
+				cur_pfn + left, nr_pages, identity, released);
+			break;
+		}
+
+		/* Update variables to reflect new mappings. */
+		i += size;
+		remap_pfn += size;
+		*identity += size;
+		*remapped += size;
+	}
+
+	/*
+	 * If the PFNs are currently mapped, the VA mapping also needs
+	 * to be updated to be 1:1.
+	 */
+	for (pfn = start_pfn; pfn <= max_pfn_mapped && pfn < end_pfn; pfn++)
+		(void)HYPERVISOR_update_va_mapping(
+			(unsigned long)__va(pfn << PAGE_SHIFT),
+			mfn_pte(pfn, PAGE_KERNEL_IO), 0);
+
+	return remap_pfn;
+}
+
+static unsigned long __init xen_set_identity_and_remap(
+	const struct e820entry *list, size_t map_size, unsigned long nr_pages,
+	unsigned long *released)
 {
 	phys_addr_t start = 0;
-	unsigned long released = 0;
 	unsigned long identity = 0;
+	unsigned long remapped = 0;
+	unsigned long last_pfn = nr_pages;
 	const struct e820entry *entry;
+	unsigned long num_released = 0;
 	int i;
 
 	/*
 	 * Combine non-RAM regions and gaps until a RAM region (or the
 	 * end of the map) is reached, then set the 1:1 map and
-	 * release the pages (if available) in those non-RAM regions.
+	 * remap the memory in those non-RAM regions.
 	 *
 	 * The combined non-RAM regions are rounded to a whole number
 	 * of pages so any partial pages are accessible via the 1:1
@@ -269,22 +491,24 @@ static unsigned long __init xen_set_identity_and_release(
 				end_pfn = PFN_UP(entry->addr);
 
 			if (start_pfn < end_pfn)
-				xen_set_identity_and_release_chunk(
-					start_pfn, end_pfn, nr_pages,
-					&released, &identity);
-
+				last_pfn = xen_set_identity_and_remap_chunk(
+						list, map_size, start_pfn,
+						end_pfn, nr_pages, last_pfn,
+						&identity, &remapped,
+						&num_released);
 			start = end;
 		}
 	}
 
-	if (released)
-		printk(KERN_INFO "Released %lu pages of unused memory\n", released);
-	if (identity)
-		printk(KERN_INFO "Set %ld page(s) to 1-1 mapping\n", identity);
+	*released = num_released;
 
-	return released;
-}
+	pr_info("Set %ld page(s) to 1-1 mapping\n", identity);
+	pr_info("Remapped %ld page(s), last_pfn=%ld\n", remapped,
+		last_pfn);
+	pr_info("Released %ld page(s)\n", num_released);
 
+	return last_pfn;
+}
 static unsigned long __init xen_get_max_pages(void)
 {
 	unsigned long max_pages = MAX_DOMAIN_PAGES;
@@ -347,7 +571,6 @@ char * __init xen_memory_setup(void)
 	unsigned long max_pages;
 	unsigned long last_pfn = 0;
 	unsigned long extra_pages = 0;
-	unsigned long populated;
 	int i;
 	int op;
 
@@ -392,20 +615,11 @@ char * __init xen_memory_setup(void)
 		extra_pages += max_pages - max_pfn;
 
 	/*
-	 * Set P2M for all non-RAM pages and E820 gaps to be identity
-	 * type PFNs.  Any RAM pages that would be made inaccesible by
-	 * this are first released.
+	 * Set identity map on non-RAM pages and remap the underlying RAM.
 	 */
-	xen_released_pages = xen_set_identity_and_release(
-		map, memmap.nr_entries, max_pfn);
-
-	/*
-	 * Populate back the non-RAM pages and E820 gaps that had been
-	 * released. */
-	populated = xen_populate_chunk(map, memmap.nr_entries,
-			max_pfn, &last_pfn, xen_released_pages);
+	last_pfn = xen_set_identity_and_remap(map, memmap.nr_entries, max_pfn,
+					      &xen_released_pages);
 
-	xen_released_pages -= populated;
 	extra_pages += xen_released_pages;
 
 	if (last_pfn > max_pfn) {
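
The remap code above rounds the start of each range up, and the end down, to P2M_PER_PAGE boundaries so that whole p2m leaf pages can be reused, then handles the unaligned remainders separately. A standalone sketch of that alignment arithmetic with illustrative values (P2M_PER_PAGE is 512 with 4 KiB pages on 64-bit):

/* Standalone sketch of the alignment step in
 * xen_do_set_identity_and_remap_chunk(); values are illustrative. */
#include <stdio.h>

#define P2M_PER_PAGE 512UL

static unsigned long align_up(unsigned long pfn)
{
	unsigned long mod = pfn % P2M_PER_PAGE;
	return mod ? pfn - mod + P2M_PER_PAGE : pfn;
}

static unsigned long align_down(unsigned long pfn)
{
	return pfn - (pfn % P2M_PER_PAGE);
}

int main(void)
{
	unsigned long start_pfn = 0x9f0, size = 0x1000;

	printf("aligned start: %#lx\n", align_up(start_pfn));          /* 0xa00 */
	printf("aligned end:   %#lx\n", align_down(start_pfn + size)); /* 0x1800 */
	return 0;
}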
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 7005974c3ff3..c670d7518cf4 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -37,6 +37,7 @@
 #include <xen/hvc-console.h>
 #include "xen-ops.h"
 #include "mmu.h"
+#include "smp.h"
 
 cpumask_var_t xen_cpu_initialized_map;
 
@@ -99,10 +100,14 @@ static void cpu_bringup(void)
 	wmb();			/* make sure everything is out */
 }
 
-/* Note: cpu parameter is only relevant for PVH */
-static void cpu_bringup_and_idle(int cpu)
+/*
+ * Note: cpu parameter is only relevant for PVH. The reason for passing it
+ * is we can't do smp_processor_id until the percpu segments are loaded, for
+ * which we need the cpu number! So we pass it in rdi as first parameter.
+ */
+asmlinkage __visible void cpu_bringup_and_idle(int cpu)
 {
-#ifdef CONFIG_X86_64
+#ifdef CONFIG_XEN_PVH
 	if (xen_feature(XENFEAT_auto_translated_physmap) &&
 	    xen_feature(XENFEAT_supervisor_mode_kernel))
 		xen_pvh_secondary_vcpu_init(cpu);
@@ -374,11 +379,10 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->user_regs.fs = __KERNEL_PERCPU;
 	ctxt->user_regs.gs = __KERNEL_STACK_CANARY;
 #endif
-	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
-
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
 	if (!xen_feature(XENFEAT_auto_translated_physmap)) {
+		ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 		ctxt->flags = VGCF_IN_KERNEL;
 		ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 		ctxt->user_regs.ds = __USER_DS;
@@ -413,15 +417,18 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 					(unsigned long)xen_failsafe_callback;
 		ctxt->user_regs.cs = __KERNEL_CS;
 		per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
-#ifdef CONFIG_X86_32
 	}
-#else
-	} else
-		/* N.B. The user_regs.eip (cpu_bringup_and_idle) is called with
-		 * %rdi having the cpu number - which means are passing in
-		 * as the first parameter the cpu. Subtle!
+#ifdef CONFIG_XEN_PVH
+	else {
+		/*
+		 * The vcpu comes on kernel page tables which have the NX pte
+		 * bit set. This means before DS/SS is touched, NX in
+		 * EFER must be set. Hence the following assembly glue code.
 		 */
+		ctxt->user_regs.eip = (unsigned long)xen_pvh_early_cpu_init;
 		ctxt->user_regs.rdi = cpu;
+		ctxt->user_regs.rsi = true;  /* entry == true */
+	}
 #endif
 	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_mfn(swapper_pg_dir));
diff --git a/arch/x86/xen/smp.h b/arch/x86/xen/smp.h
index c7c2d89efd76..963d62a35c82 100644
--- a/arch/x86/xen/smp.h
+++ b/arch/x86/xen/smp.h
@@ -8,4 +8,12 @@ extern void xen_send_IPI_allbutself(int vector);
 extern void xen_send_IPI_all(int vector);
 extern void xen_send_IPI_self(int vector);
 
+#ifdef CONFIG_XEN_PVH
+extern void xen_pvh_early_cpu_init(int cpu, bool entry);
+#else
+static inline void xen_pvh_early_cpu_init(int cpu, bool entry)
+{
+}
+#endif
+
 #endif
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index 485b69585540..674b222544b7 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -47,6 +47,41 @@ ENTRY(startup_xen)
 
 	__FINIT
 
+#ifdef CONFIG_XEN_PVH
+/*
+ * xen_pvh_early_cpu_init() - early PVH VCPU initialization
+ * @cpu:   this cpu number (%rdi)
+ * @entry: true if this is a secondary vcpu coming up on this entry
+ *         point, false if this is the boot CPU being initialized for
+ *         the first time (%rsi)
+ *
+ * Note: This is called as a function on the boot CPU, and is the entry point
+ *       on the secondary CPU.
+ */
+ENTRY(xen_pvh_early_cpu_init)
+	mov     %rsi, %r11
+
+	/* Gather features to see if NX implemented. */
+	mov     $0x80000001, %eax
+	cpuid
+	mov     %edx, %esi
+
+	mov     $MSR_EFER, %ecx
+	rdmsr
+	bts     $_EFER_SCE, %eax
+
+	bt      $20, %esi
+	jnc     1f      	/* No NX, skip setting it */
+	bts     $_EFER_NX, %eax
+1:	wrmsr
+#ifdef CONFIG_SMP
+	cmp     $0, %r11b
+	jne     cpu_bringup_and_idle
+#endif
+	ret
+
+#endif /* CONFIG_XEN_PVH */
+
 .pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
@@ -124,6 +159,7 @@ NEXT_HYPERCALL(arch_6)
 	ELFNOTE(Xen, XEN_ELFNOTE_L1_MFN_VALID,
 		.quad _PAGE_PRESENT; .quad _PAGE_PRESENT)
 	ELFNOTE(Xen, XEN_ELFNOTE_SUSPEND_CANCEL, .long 1)
+	ELFNOTE(Xen, XEN_ELFNOTE_MOD_START_PFN,  .long 1)
 	ELFNOTE(Xen, XEN_ELFNOTE_HV_START_LOW,   _ASM_PTR __HYPERVISOR_VIRT_START)
 	ELFNOTE(Xen, XEN_ELFNOTE_PADDR_OFFSET,   _ASM_PTR 0)
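
The assembly above sets EFER.SCE unconditionally and EFER.NX only when CPUID leaf 0x80000001 reports NX support (EDX bit 20), so a PVH vcpu can run on NX-marked kernel page tables before DS/SS is touched. A standalone C sketch of the same logic, with rdmsr/wrmsr modelled as plain values:

/* Standalone sketch of the EFER fixup done by xen_pvh_early_cpu_init(). */
#include <stdint.h>
#include <stdio.h>

#define _EFER_SCE 0	/* SYSCALL/SYSRET enable */
#define _EFER_NX  11	/* no-execute enable */

static uint64_t fixup_efer(uint64_t efer, uint32_t cpuid_80000001_edx)
{
	efer |= 1ULL << _EFER_SCE;
	if (cpuid_80000001_edx & (1u << 20))	/* NX supported? */
		efer |= 1ULL << _EFER_NX;
	return efer;
}

int main(void)
{
	printf("EFER with NX:    %#llx\n",
	       (unsigned long long)fixup_efer(0, 1u << 20));	/* 0x801 */
	printf("EFER without NX: %#llx\n",
	       (unsigned long long)fixup_efer(0, 0));		/* 0x1 */
	return 0;
}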