author		Linus Torvalds <torvalds@linux-foundation.org>	2017-11-13 14:13:48 -0800
committer	Linus Torvalds <torvalds@linux-foundation.org>	2017-11-13 14:13:48 -0800
commit		d6ec9d9a4def52a5094237564eaf6f6979fd7a27 (patch)
tree		adfb80f83f04a021e82cb25227b64b1bb9e793dc /arch/x86
parent		3e2014637c50e5d6a77cd63d5db6c209fe29d1b1 (diff)
parent		91a6a6cfee8ad34ea4cc10a54c0765edfe437cdb (diff)
download	linux-d6ec9d9a4def52a5094237564eaf6f6979fd7a27.tar.gz
Merge branch 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 core updates from Ingo Molnar:
 "Note that in this cycle most of the x86 topics interacted at a level
  that caused them to be merged into tip:x86/asm - but this should be a
  temporary phenomenon; hopefully we'll be back to the usual patterns in
  the next merge window.

  The main changes in this cycle were:

  Hardware enablement:

   - Add support for the Intel UMIP (User Mode Instruction Prevention)
     CPU feature. This is a security feature that prevents the SGDT,
     SLDT, SIDT, SMSW and STR instructions from being executed in user
     mode. (Ricardo Neri)

     [ Note that this is disabled by default for now; there are some
       smaller enhancements in the pipeline that I'll follow up with in
       the next 1-2 days, which will allow this to be enabled by
       default. ]

   - Add support for the AMD SEV (Secure Encrypted Virtualization) CPU
     feature, on top of SME (Secure Memory Encryption) support that was
     added in v4.14. (Tom Lendacky, Brijesh Singh)

   - Enable new SSE/AVX/AVX512 CPU features: AVX512_VBMI2, GFNI, VAES,
     VPCLMULQDQ, AVX512_VNNI, AVX512_BITALG. (Gayatri Kammela)

  Other changes:

   - A big series of entry code simplifications and enhancements (Andy
     Lutomirski)

   - Make the ORC unwinder the default on x86, plus various objtool
     enhancements. (Josh Poimboeuf)

   - 5-level paging enhancements (Kirill A. Shutemov)

   - Micro-optimize the entry code a bit (Borislav Petkov)

   - Improve the handling of interdependent CPU features in the early
     FPU init code (Andi Kleen)

   - Build system enhancements (Changbin Du, Masahiro Yamada)

   - ... plus misc enhancements, fixes and cleanups"

* 'x86-asm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (118 commits)
  x86/build: Make the boot image generation less verbose
  selftests/x86: Add tests for the STR and SLDT instructions
  selftests/x86: Add tests for User-Mode Instruction Prevention
  x86/traps: Fix up general protection faults caused by UMIP
  x86/umip: Enable User-Mode Instruction Prevention at runtime
  x86/umip: Force a page fault when unable to copy emulated result to user
  x86/umip: Add emulation code for UMIP instructions
  x86/cpufeature: Add User-Mode Instruction Prevention definitions
  x86/insn-eval: Add support to resolve 16-bit address encodings
  x86/insn-eval: Handle 32-bit address encodings in virtual-8086 mode
  x86/insn-eval: Add wrapper function for 32 and 64-bit addresses
  x86/insn-eval: Add support to resolve 32-bit address encodings
  x86/insn-eval: Compute linear address in several utility functions
  resource: Fix resource_size.cocci warnings
  X86/KVM: Clear encryption attribute when SEV is active
  X86/KVM: Decrypt shared per-cpu variables when SEV is active
  percpu: Introduce DEFINE_PER_CPU_DECRYPTED
  x86: Add support for changing memory encryption attribute in early boot
  x86/io: Unroll string I/O when SEV is active
  x86/boot: Add early boot support when running with SEV active
  ...
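A note on the UMIP entries above: with CR4.UMIP set, the SGDT, SIDT, SLDT, SMSW and STR instructions raise a general protection fault when executed outside ring 0, and this series adds both kernel-side fixup/emulation of the resulting #GP and user-space selftests. As a rough, hypothetical illustration of what those selftests exercise (not code from this series), a user-space probe could look like the sketch below; whether it dies with SIGSEGV or receives kernel-supplied dummy values depends on the emulation policy of the running kernel.

/*
 * Hypothetical UMIP probe (illustration only).  On a UMIP-enforcing
 * kernel these instructions either fault at CPL 3 or are emulated,
 * so the real GDT base is no longer exposed to unprivileged code.
 */
#include <stdio.h>
#include <stdint.h>

struct __attribute__((packed)) desc_ptr {
	uint16_t limit;
	uint64_t base;
};

int main(void)
{
	struct desc_ptr gdt = { 0, 0 };
	unsigned long msw = 0;

	asm volatile("sgdt %0" : "=m" (gdt));	/* store GDT limit/base */
	asm volatile("smsw %0" : "=r" (msw));	/* store machine status word */

	printf("SGDT: base=0x%llx limit=0x%x\n",
	       (unsigned long long)gdt.base, gdt.limit);
	printf("SMSW: 0x%lx\n", msw);
	return 0;
}
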
Diffstat (limited to 'arch/x86')
-rw-r--r--  arch/x86/Kconfig  13
-rw-r--r--  arch/x86/Kconfig.debug  39
-rw-r--r--  arch/x86/boot/.gitignore  3
-rw-r--r--  arch/x86/boot/Makefile  59
-rw-r--r--  arch/x86/boot/compressed/Makefile  1
-rw-r--r--  arch/x86/boot/compressed/head_64.S  16
-rw-r--r--  arch/x86/boot/compressed/mem_encrypt.S  120
-rw-r--r--  arch/x86/boot/compressed/misc.h  2
-rw-r--r--  arch/x86/boot/compressed/pagetable.c  8
-rw-r--r--  arch/x86/boot/genimage.sh  124
-rw-r--r--  arch/x86/configs/tiny.config  4
-rw-r--r--  arch/x86/configs/x86_64_defconfig  1
-rw-r--r--  arch/x86/entry/calling.h  69
-rw-r--r--  arch/x86/entry/entry_64.S  141
-rw-r--r--  arch/x86/entry/entry_64_compat.S  3
-rw-r--r--  arch/x86/entry/syscalls/Makefile  4
-rw-r--r--  arch/x86/entry/vdso/vma.c  5
-rw-r--r--  arch/x86/include/asm/archrandom.h  8
-rw-r--r--  arch/x86/include/asm/bitops.h  10
-rw-r--r--  arch/x86/include/asm/compat.h  1
-rw-r--r--  arch/x86/include/asm/cpufeature.h  9
-rw-r--r--  arch/x86/include/asm/cpufeatures.h  537
-rw-r--r--  arch/x86/include/asm/disabled-features.h  8
-rw-r--r--  arch/x86/include/asm/inat.h  10
-rw-r--r--  arch/x86/include/asm/insn-eval.h  23
-rw-r--r--  arch/x86/include/asm/io.h  43
-rw-r--r--  arch/x86/include/asm/mem_encrypt.h  14
-rw-r--r--  arch/x86/include/asm/module.h  2
-rw-r--r--  arch/x86/include/asm/msr-index.h  3
-rw-r--r--  arch/x86/include/asm/paravirt.h  5
-rw-r--r--  arch/x86/include/asm/paravirt_types.h  2
-rw-r--r--  arch/x86/include/asm/percpu.h  2
-rw-r--r--  arch/x86/include/asm/pgtable_types.h  3
-rw-r--r--  arch/x86/include/asm/processor.h  52
-rw-r--r--  arch/x86/include/asm/ptrace.h  6
-rw-r--r--  arch/x86/include/asm/rmwcc.h  2
-rw-r--r--  arch/x86/include/asm/switch_to.h  24
-rw-r--r--  arch/x86/include/asm/syscalls.h  2
-rw-r--r--  arch/x86/include/asm/trace/fpu.h  10
-rw-r--r--  arch/x86/include/asm/traps.h  20
-rw-r--r--  arch/x86/include/asm/umip.h  12
-rw-r--r--  arch/x86/include/asm/unwind.h  8
-rw-r--r--  arch/x86/include/uapi/asm/kvm_para.h  1
-rw-r--r--  arch/x86/include/uapi/asm/processor-flags.h  5
-rw-r--r--  arch/x86/kernel/Makefile  11
-rw-r--r--  arch/x86/kernel/alternative.c  26
-rw-r--r--  arch/x86/kernel/cpu/Makefile  1
-rw-r--r--  arch/x86/kernel/cpu/common.c  54
-rw-r--r--  arch/x86/kernel/cpu/cpuid-deps.c  121
-rw-r--r--  arch/x86/kernel/crash.c  18
-rw-r--r--  arch/x86/kernel/fpu/init.c  11
-rw-r--r--  arch/x86/kernel/fpu/xstate.c  43
-rw-r--r--  arch/x86/kernel/head_32.S  5
-rw-r--r--  arch/x86/kernel/head_64.S  45
-rw-r--r--  arch/x86/kernel/kvm.c  40
-rw-r--r--  arch/x86/kernel/kvmclock.c  65
-rw-r--r--  arch/x86/kernel/ldt.c  16
-rw-r--r--  arch/x86/kernel/pmem.c  2
-rw-r--r--  arch/x86/kernel/process.c  8
-rw-r--r--  arch/x86/kernel/process_32.c  6
-rw-r--r--  arch/x86/kernel/process_64.c  5
-rw-r--r--  arch/x86/kernel/setup.c  6
-rw-r--r--  arch/x86/kernel/smpboot.c  3
-rw-r--r--  arch/x86/kernel/traps.c  9
-rw-r--r--  arch/x86/kernel/umip.c  366
-rw-r--r--  arch/x86/kernel/uprobes.c  15
-rw-r--r--  arch/x86/kernel/verify_cpu.S  3
-rw-r--r--  arch/x86/kernel/vm86_32.c  20
-rw-r--r--  arch/x86/lib/Makefile  2
-rw-r--r--  arch/x86/lib/insn-eval.c  1364
-rw-r--r--  arch/x86/mm/fault.c  88
-rw-r--r--  arch/x86/mm/init_64.c  10
-rw-r--r--  arch/x86/mm/ioremap.c  123
-rw-r--r--  arch/x86/mm/kasan_init_64.c  101
-rw-r--r--  arch/x86/mm/mem_encrypt.c  301
-rw-r--r--  arch/x86/mm/mpx.c  120
-rw-r--r--  arch/x86/mm/pageattr.c  4
-rw-r--r--  arch/x86/platform/efi/efi_64.c  16
-rw-r--r--  arch/x86/realmode/init.c  5
-rw-r--r--  arch/x86/um/ldt.c  7
-rw-r--r--  arch/x86/xen/enlighten_pv.c  9
-rw-r--r--  arch/x86/xen/mmu_pv.c  159
-rw-r--r--  arch/x86/xen/smp_pv.c  17
-rw-r--r--  arch/x86/xen/xen-asm_64.S  2
-rw-r--r--  arch/x86/xen/xen-head.S  11
85 files changed, 3657 insertions, 1015 deletions
diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 9bceea6a5852..28fe465c7b7b 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -171,7 +171,7 @@ config X86
 	select HAVE_PERF_USER_STACK_DUMP
 	select HAVE_RCU_TABLE_FREE
 	select HAVE_REGS_AND_STACK_ACCESS_API
-	select HAVE_RELIABLE_STACKTRACE		if X86_64 && FRAME_POINTER_UNWINDER && STACK_VALIDATION
+	select HAVE_RELIABLE_STACKTRACE		if X86_64 && UNWINDER_FRAME_POINTER && STACK_VALIDATION
 	select HAVE_STACK_VALIDATION		if X86_64
 	select HAVE_SYSCALL_TRACEPOINTS
 	select HAVE_UNSTABLE_SCHED_CLOCK
@@ -303,7 +303,6 @@ config ARCH_SUPPORTS_DEBUG_PAGEALLOC
 config KASAN_SHADOW_OFFSET
 	hex
 	depends on KASAN
-	default 0xdff8000000000000 if X86_5LEVEL
 	default 0xdffffc0000000000
 
 config HAVE_INTEL_TXT
@@ -1803,6 +1802,16 @@ config X86_SMAP
 
 	  If unsure, say Y.
 
+config X86_INTEL_UMIP
+	def_bool n
+	depends on CPU_SUP_INTEL
+	prompt "Intel User Mode Instruction Prevention" if EXPERT
+	---help---
+	  The User Mode Instruction Prevention (UMIP) is a security
+	  feature in newer Intel processors. If enabled, a general
+	  protection fault is issued if the instructions SGDT, SLDT,
+	  SIDT, SMSW and STR are executed in user mode.
+
 config X86_INTEL_MPX
 	prompt "Intel MPX (Memory Protection Extensions)"
 	def_bool n
diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug
index 90b123056f4b..6293a8768a91 100644
--- a/arch/x86/Kconfig.debug
+++ b/arch/x86/Kconfig.debug
@@ -359,28 +359,14 @@ config PUNIT_ATOM_DEBUG
 
 choice
 	prompt "Choose kernel unwinder"
-	default FRAME_POINTER_UNWINDER
+	default UNWINDER_ORC if X86_64
+	default UNWINDER_FRAME_POINTER if X86_32
 	---help---
 	  This determines which method will be used for unwinding kernel stack
 	  traces for panics, oopses, bugs, warnings, perf, /proc/<pid>/stack,
 	  livepatch, lockdep, and more.
 
-config FRAME_POINTER_UNWINDER
-	bool "Frame pointer unwinder"
-	select FRAME_POINTER
-	---help---
-	  This option enables the frame pointer unwinder for unwinding kernel
-	  stack traces.
-
-	  The unwinder itself is fast and it uses less RAM than the ORC
-	  unwinder, but the kernel text size will grow by ~3% and the kernel's
-	  overall performance will degrade by roughly 5-10%.
-
-	  This option is recommended if you want to use the livepatch
-	  consistency model, as this is currently the only way to get a
-	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
-
-config ORC_UNWINDER
+config UNWINDER_ORC
 	bool "ORC unwinder"
 	depends on X86_64
 	select STACK_VALIDATION
@@ -396,7 +382,22 @@ config ORC_UNWINDER
 	  Enabling this option will increase the kernel's runtime memory usage
 	  by roughly 2-4MB, depending on your kernel config.
 
-config GUESS_UNWINDER
+config UNWINDER_FRAME_POINTER
+	bool "Frame pointer unwinder"
+	select FRAME_POINTER
+	---help---
+	  This option enables the frame pointer unwinder for unwinding kernel
+	  stack traces.
+
+	  The unwinder itself is fast and it uses less RAM than the ORC
+	  unwinder, but the kernel text size will grow by ~3% and the kernel's
+	  overall performance will degrade by roughly 5-10%.
+
+	  This option is recommended if you want to use the livepatch
+	  consistency model, as this is currently the only way to get a
+	  reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE).
+
+config UNWINDER_GUESS
 	bool "Guess unwinder"
 	depends on EXPERT
 	---help---
@@ -411,7 +412,7 @@ config GUESS_UNWINDER
 endchoice
 
 config FRAME_POINTER
-	depends on !ORC_UNWINDER && !GUESS_UNWINDER
+	depends on !UNWINDER_ORC && !UNWINDER_GUESS
 	bool
 
 endmenu
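Whichever unwinder the renamed options above select, in-kernel users walk stacks through the common interface in arch/x86/include/asm/unwind.h (also touched by this series, per the diffstat), so the backend choice stays invisible to callers. A minimal sketch of such a walk, assuming kernel context and the usual helpers, might look like:

/*
 * Sketch only: iterate a task's kernel stack via the common unwind API.
 * The backend (ORC, frame pointers, or guess) comes from the Kconfig
 * choice above; this caller neither knows nor cares which one it is.
 */
#include <linux/sched.h>
#include <linux/printk.h>
#include <asm/unwind.h>

static void dump_task_stack(struct task_struct *task)
{
	struct unwind_state state;

	for (unwind_start(&state, task, NULL, NULL);
	     !unwind_done(&state);
	     unwind_next_frame(&state)) {
		unsigned long addr = unwind_get_return_address(&state);

		if (!addr)
			break;
		pr_info("  %pS\n", (void *)addr);
	}
}
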
diff --git a/arch/x86/boot/.gitignore b/arch/x86/boot/.gitignore
index e3cf9f682be5..09d25dd09307 100644
--- a/arch/x86/boot/.gitignore
+++ b/arch/x86/boot/.gitignore
@@ -7,3 +7,6 @@ zoffset.h
 setup
 setup.bin
 setup.elf
+fdimage
+mtools.conf
+image.iso
diff --git a/arch/x86/boot/Makefile b/arch/x86/boot/Makefile
index d88a2fddba8c..9b5adae9cc40 100644
--- a/arch/x86/boot/Makefile
+++ b/arch/x86/boot/Makefile
@@ -123,63 +123,26 @@ image_cmdline = default linux $(FDARGS) $(if $(FDINITRD),initrd=initrd.img,)
 $(obj)/mtools.conf: $(src)/mtools.conf.in
 	sed -e 's|@OBJ@|$(obj)|g' < $< > $@
 
+quiet_cmd_genimage = GENIMAGE $3
+cmd_genimage = sh $(srctree)/$(src)/genimage.sh $2 $3 $(obj)/bzImage \
+			$(obj)/mtools.conf '$(image_cmdline)' $(FDINITRD)
+
 # This requires write access to /dev/fd0
 bzdisk: $(obj)/bzImage $(obj)/mtools.conf
-	MTOOLSRC=$(obj)/mtools.conf mformat a:			; sync
-	syslinux /dev/fd0					; sync
-	echo '$(image_cmdline)' | \
-		MTOOLSRC=$(src)/mtools.conf mcopy - a:syslinux.cfg
-	if [ -f '$(FDINITRD)' ] ; then \
-		MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' a:initrd.img ; \
-	fi
-	MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage a:linux	; sync
+	$(call cmd,genimage,bzdisk,/dev/fd0)
 
 # These require being root or having syslinux 2.02 or higher installed
 fdimage fdimage144: $(obj)/bzImage $(obj)/mtools.conf
-	dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=1440
-	MTOOLSRC=$(obj)/mtools.conf mformat v:			; sync
-	syslinux $(obj)/fdimage					; sync
-	echo '$(image_cmdline)' | \
-		MTOOLSRC=$(obj)/mtools.conf mcopy - v:syslinux.cfg
-	if [ -f '$(FDINITRD)' ] ; then \
-		MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' v:initrd.img ; \
-	fi
-	MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage v:linux	; sync
+	$(call cmd,genimage,fdimage144,$(obj)/fdimage)
+	@$(kecho) 'Kernel: $(obj)/fdimage is ready'
 
 fdimage288: $(obj)/bzImage $(obj)/mtools.conf
-	dd if=/dev/zero of=$(obj)/fdimage bs=1024 count=2880
-	MTOOLSRC=$(obj)/mtools.conf mformat w:			; sync
-	syslinux $(obj)/fdimage					; sync
-	echo '$(image_cmdline)' | \
-		MTOOLSRC=$(obj)/mtools.conf mcopy - w:syslinux.cfg
-	if [ -f '$(FDINITRD)' ] ; then \
-		MTOOLSRC=$(obj)/mtools.conf mcopy '$(FDINITRD)' w:initrd.img ; \
-	fi
-	MTOOLSRC=$(obj)/mtools.conf mcopy $(obj)/bzImage w:linux	; sync
+	$(call cmd,genimage,fdimage288,$(obj)/fdimage)
+	@$(kecho) 'Kernel: $(obj)/fdimage is ready'
 
 isoimage: $(obj)/bzImage
-	-rm -rf $(obj)/isoimage
-	mkdir $(obj)/isoimage
-	for i in lib lib64 share end ; do \
-		if [ -f /usr/$$i/syslinux/isolinux.bin ] ; then \
-			cp /usr/$$i/syslinux/isolinux.bin $(obj)/isoimage ; \
-			if [ -f /usr/$$i/syslinux/ldlinux.c32 ]; then \
-				cp /usr/$$i/syslinux/ldlinux.c32 $(obj)/isoimage ; \
-			fi ; \
-			break ; \
-		fi ; \
-		if [ $$i = end ] ; then exit 1 ; fi ; \
-	done
-	cp $(obj)/bzImage $(obj)/isoimage/linux
-	echo '$(image_cmdline)' > $(obj)/isoimage/isolinux.cfg
-	if [ -f '$(FDINITRD)' ] ; then \
-		cp '$(FDINITRD)' $(obj)/isoimage/initrd.img ; \
-	fi
-	mkisofs -J -r -o $(obj)/image.iso -b isolinux.bin -c boot.cat \
-		-no-emul-boot -boot-load-size 4 -boot-info-table \
-		$(obj)/isoimage
-	isohybrid $(obj)/image.iso 2>/dev/null || true
-	rm -rf $(obj)/isoimage
+	$(call cmd,genimage,isoimage,$(obj)/image.iso)
+	@$(kecho) 'Kernel: $(obj)/image.iso is ready'
 
 bzlilo: $(obj)/bzImage
 	if [ -f $(INSTALL_PATH)/vmlinuz ]; then mv $(INSTALL_PATH)/vmlinuz $(INSTALL_PATH)/vmlinuz.old; fi
diff --git a/arch/x86/boot/compressed/Makefile b/arch/x86/boot/compressed/Makefile
index 4b7575b00563..61827c2282bf 100644
--- a/arch/x86/boot/compressed/Makefile
+++ b/arch/x86/boot/compressed/Makefile
@@ -78,6 +78,7 @@ vmlinux-objs-$(CONFIG_EARLY_PRINTK) += $(obj)/early_serial_console.o
 vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/kaslr.o
 ifdef CONFIG_X86_64
 	vmlinux-objs-$(CONFIG_RANDOMIZE_BASE) += $(obj)/pagetable.o
+	vmlinux-objs-y += $(obj)/mem_encrypt.o
 endif
 
 $(obj)/eboot.o: KBUILD_CFLAGS += -fshort-wchar -mno-red-zone
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index beb255b66447..20919b4f3133 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -131,6 +131,19 @@ ENTRY(startup_32)
  /*
   * Build early 4G boot pagetable
   */
+	/*
+	 * If SEV is active then set the encryption mask in the page tables.
+	 * This will ensure that when the kernel is copied and decompressed
+	 * it is written out as encrypted memory.
+	 */
+	call	get_sev_encryption_bit
+	xorl	%edx, %edx
+	testl	%eax, %eax
+	jz	1f
+	subl	$32, %eax	/* Encryption bit is always above bit 31 */
+	bts	%eax, %edx	/* Set encryption mask for page tables */
+1:
+
 	/* Initialize Page tables to 0 */
 	leal	pgtable(%ebx), %edi
 	xorl	%eax, %eax
@@ -141,12 +154,14 @@ ENTRY(startup_32)
 	leal	pgtable + 0(%ebx), %edi
 	leal	0x1007 (%edi), %eax
 	movl	%eax, 0(%edi)
+	addl	%edx, 4(%edi)
 
 	/* Build Level 3 */
 	leal	pgtable + 0x1000(%ebx), %edi
 	leal	0x1007(%edi), %eax
 	movl	$4, %ecx
 1:	movl	%eax, 0x00(%edi)
+	addl	%edx, 0x04(%edi)
 	addl	$0x00001000, %eax
 	addl	$8, %edi
 	decl	%ecx
@@ -157,6 +172,7 @@ ENTRY(startup_32)
 	movl	$0x00000183, %eax
 	movl	$2048, %ecx
 1:	movl	%eax, 0(%edi)
+	addl	%edx, 4(%edi)
 	addl	$0x00200000, %eax
 	addl	$8, %edi
 	decl	%ecx
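For readers following the 32-bit code above: the SEV encryption bit (the "C-bit") always lies above bit 31 of a page table entry, so the code keeps only a high-dword mask in %edx ("subl $32" plus "bts") and adds it to offset 4 of each 8-byte entry. The same arithmetic in C, with illustrative names that are not part of this series:

#include <stdint.h>

/*
 * Illustration only: a C-bit position of 0 means SEV is not active;
 * otherwise the bit is >= 32, so setting it only needs to touch the
 * upper 32 bits of the entry -- the effect of "addl %edx, 4(%edi)".
 */
static void set_c_bit_hi(uint32_t *pte_hi, unsigned int c_bit)
{
	uint32_t mask_hi = 0;

	if (c_bit)
		mask_hi = UINT32_C(1) << (c_bit - 32);

	*pte_hi |= mask_hi;	/* the low dword stays untouched */
}
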
diff --git a/arch/x86/boot/compressed/mem_encrypt.S b/arch/x86/boot/compressed/mem_encrypt.S
new file mode 100644
index 000000000000..54f5f6625a73
--- /dev/null
+++ b/arch/x86/boot/compressed/mem_encrypt.S
@@ -0,0 +1,120 @@
+/*
+ * AMD Memory Encryption Support
+ *
+ * Copyright (C) 2017 Advanced Micro Devices, Inc.
+ *
+ * Author: Tom Lendacky <thomas.lendacky@amd.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/linkage.h>
+
+#include <asm/processor-flags.h>
+#include <asm/msr.h>
+#include <asm/asm-offsets.h>
+
+	.text
+	.code32
+ENTRY(get_sev_encryption_bit)
+	xor	%eax, %eax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	push	%ebx
+	push	%ecx
+	push	%edx
+	push	%edi
+
+	/*
+	 * RIP-relative addressing is needed to access the encryption bit
+	 * variable. Since we are running in 32-bit mode we need this call/pop
+	 * sequence to get the proper relative addressing.
+	 */
+	call	1f
+1:	popl	%edi
+	subl	$1b, %edi
+
+	movl	enc_bit(%edi), %eax
+	cmpl	$0, %eax
+	jge	.Lsev_exit
+
+	/* Check if running under a hypervisor */
+	movl	$1, %eax
+	cpuid
+	bt	$31, %ecx		/* Check the hypervisor bit */
+	jnc	.Lno_sev
+
+	movl	$0x80000000, %eax	/* CPUID to check the highest leaf */
+	cpuid
+	cmpl	$0x8000001f, %eax	/* See if 0x8000001f is available */
+	jb	.Lno_sev
+
+	/*
+	 * Check for the SEV feature:
+	 *   CPUID Fn8000_001F[EAX] - Bit 1
+	 *   CPUID Fn8000_001F[EBX] - Bits 5:0
+	 *     Pagetable bit position used to indicate encryption
+	 */
+	movl	$0x8000001f, %eax
+	cpuid
+	bt	$1, %eax		/* Check if SEV is available */
+	jnc	.Lno_sev
+
+	movl	$MSR_AMD64_SEV, %ecx	/* Read the SEV MSR */
+	rdmsr
+	bt	$MSR_AMD64_SEV_ENABLED_BIT, %eax	/* Check if SEV is active */
+	jnc	.Lno_sev
+
+	movl	%ebx, %eax
+	andl	$0x3f, %eax		/* Return the encryption bit location */
+	movl	%eax, enc_bit(%edi)
+	jmp	.Lsev_exit
+
+.Lno_sev:
+	xor	%eax, %eax
+	movl	%eax, enc_bit(%edi)
+
+.Lsev_exit:
+	pop	%edi
+	pop	%edx
+	pop	%ecx
+	pop	%ebx
+
+#endif	/* CONFIG_AMD_MEM_ENCRYPT */
+
+	ret
+ENDPROC(get_sev_encryption_bit)
+
+	.code64
+ENTRY(get_sev_encryption_mask)
+	xor	%rax, %rax
+
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+	push	%rbp
+	push	%rdx
+
+	movq	%rsp, %rbp		/* Save current stack pointer */
+
+	call	get_sev_encryption_bit	/* Get the encryption bit position */
+	testl	%eax, %eax
+	jz	.Lno_sev_mask
+
+	xor	%rdx, %rdx
+	bts	%rax, %rdx		/* Create the encryption mask */
+	mov	%rdx, %rax		/* ... and return it */
+
+.Lno_sev_mask:
+	movq	%rbp, %rsp		/* Restore original stack pointer */
+
+	pop	%rdx
+	pop	%rbp
+#endif
+
+	ret
+ENDPROC(get_sev_encryption_mask)
+
+	.data
+enc_bit:
+	.int	0xffffffff
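The detection logic of get_sev_encryption_bit() above can also be read in C form. The sketch below is illustrative only and assumes an rdmsr() helper -- MSRs are not readable from plain user space; in the kernel this would be rdmsrl(), from user space the /dev/cpu/*/msr interface:

#include <stdint.h>
#include <cpuid.h>			/* GCC/clang __get_cpuid() */

#define MSR_AMD64_SEV		0xc0010131
#define MSR_AMD64_SEV_ENABLED	(1ULL << 0)

extern uint64_t rdmsr(uint32_t msr);	/* assumed helper, see note above */

/* Returns the page-table encryption bit position, or 0 if SEV is off. */
static unsigned int sev_encryption_bit(void)
{
	unsigned int eax, ebx, ecx, edx;

	/* The highest extended CPUID leaf must cover 0x8000001f. */
	if (!__get_cpuid(0x80000000, &eax, &ebx, &ecx, &edx) ||
	    eax < 0x8000001f)
		return 0;

	/* Fn8000_001F: EAX bit 1 = SEV supported, EBX[5:0] = C-bit position. */
	if (!__get_cpuid(0x8000001f, &eax, &ebx, &ecx, &edx) ||
	    !(eax & (1u << 1)))
		return 0;

	/* SEV must actually be enabled for this guest. */
	if (!(rdmsr(MSR_AMD64_SEV) & MSR_AMD64_SEV_ENABLED))
		return 0;

	return ebx & 0x3f;
}
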
diff --git a/arch/x86/boot/compressed/misc.h b/arch/x86/boot/compressed/misc.h
index 32d4ec2e0243..9d323dc6b159 100644
--- a/arch/x86/boot/compressed/misc.h
+++ b/arch/x86/boot/compressed/misc.h
@@ -109,4 +109,6 @@ static inline void console_init(void)
 { }
 #endif
 
+unsigned long get_sev_encryption_mask(void);
+
 #endif
diff --git a/arch/x86/boot/compressed/pagetable.c b/arch/x86/boot/compressed/pagetable.c
index 972319ff5b01..d5364ca2e3f9 100644
--- a/arch/x86/boot/compressed/pagetable.c
+++ b/arch/x86/boot/compressed/pagetable.c
@@ -77,16 +77,18 @@ static unsigned long top_level_pgt;
  * Mapping information structure passed to kernel_ident_mapping_init().
  * Due to relocation, pointers must be assigned at run time not build time.
  */
-static struct x86_mapping_info mapping_info = {
-	.page_flag       = __PAGE_KERNEL_LARGE_EXEC,
-};
+static struct x86_mapping_info mapping_info;
 
 /* Locates and clears a region for a new top level page table. */
 void initialize_identity_maps(void)
 {
+	unsigned long sev_me_mask = get_sev_encryption_mask();
+
 	/* Init mapping_info with run-time function/buffer pointers. */
 	mapping_info.alloc_pgt_page = alloc_pgt_page;
 	mapping_info.context = &pgt_data;
+	mapping_info.page_flag = __PAGE_KERNEL_LARGE_EXEC | sev_me_mask;
+	mapping_info.kernpg_flag = _KERNPG_TABLE | sev_me_mask;
 
 	/*
 	 * It should be impossible for this not to already be true,
diff --git a/arch/x86/boot/genimage.sh b/arch/x86/boot/genimage.sh
new file mode 100644
index 000000000000..49f4970f693b
--- /dev/null
+++ b/arch/x86/boot/genimage.sh
@@ -0,0 +1,124 @@
+#!/bin/sh
+#
+# This file is subject to the terms and conditions of the GNU General Public
+# License.  See the file "COPYING" in the main directory of this archive
+# for more details.
+#
+# Copyright (C) 2017 by Changbin Du <changbin.du@intel.com>
+#
+# Adapted from code in arch/x86/boot/Makefile by H. Peter Anvin and others
+#
+# "make fdimage/fdimage144/fdimage288/isoimage" script for x86 architecture
+#
+# Arguments:
+#   $1 - fdimage format
+#   $2 - target image file
+#   $3 - kernel bzImage file
+#   $4 - mtools configuration file
+#   $5 - kernel cmdline
+#   $6 - initrd image file
+#
+
+# Use "make V=1" to debug this script
+case "${KBUILD_VERBOSE}" in
+*1*)
+        set -x
+        ;;
+esac
+
+verify () {
+	if [ ! -f "$1" ]; then
+		echo ""                                                   1>&2
+		echo " *** Missing file: $1"                              1>&2
+		echo ""                                                   1>&2
+		exit 1
+	fi
+}
+
+
+export MTOOLSRC=$4
+FIMAGE=$2
+FBZIMAGE=$3
+KCMDLINE=$5
+FDINITRD=$6
+
+# Make sure the files actually exist
+verify "$FBZIMAGE"
+verify "$MTOOLSRC"
+
+genbzdisk() {
+	mformat a:
+	syslinux $FIMAGE
+	echo "$KCMDLINE" | mcopy - a:syslinux.cfg
+	if [ -f "$FDINITRD" ] ; then
+		mcopy "$FDINITRD" a:initrd.img
+	fi
+	mcopy $FBZIMAGE a:linux
+}
+
+genfdimage144() {
+	dd if=/dev/zero of=$FIMAGE bs=1024 count=1440 2> /dev/null
+	mformat v:
+	syslinux $FIMAGE
+	echo "$KCMDLINE" | mcopy - v:syslinux.cfg
+	if [ -f "$FDINITRD" ] ; then
+		mcopy "$FDINITRD" v:initrd.img
+	fi
+	mcopy $FBZIMAGE v:linux
+}
+
+genfdimage288() {
+	dd if=/dev/zero of=$FIMAGE bs=1024 count=2880 2> /dev/null
+	mformat w:
+	syslinux $FIMAGE
+		echo "$KCMDLINE" | mcopy - w:syslinux.cfg
+	if [ -f "$FDINITRD" ] ; then
+		mcopy "$FDINITRD" w:initrd.img
+	fi
+	mcopy $FBZIMAGE w:linux
+}
+
+genisoimage() {
+	tmp_dir=`dirname $FIMAGE`/isoimage
+	rm -rf $tmp_dir
+	mkdir $tmp_dir
+	for i in lib lib64 share end ; do
+		for j in syslinux ISOLINUX ; do
+			if [ -f /usr/$i/$j/isolinux.bin ] ; then
+				isolinux=/usr/$i/$j/isolinux.bin
+				cp $isolinux $tmp_dir
+			fi
+		done
+		for j in syslinux syslinux/modules/bios ; do
+			if [ -f /usr/$i/$j/ldlinux.c32 ]; then
+				ldlinux=/usr/$i/$j/ldlinux.c32
+				cp $ldlinux $tmp_dir
+			fi
+		done
+		if [ -n "$isolinux" -a -n "$ldlinux" ] ; then
+			break
+		fi
+		if [ $i = end -a -z "$isolinux" ] ; then
+			echo 'Need an isolinux.bin file, please install syslinux/isolinux.'
+			exit 1
+		fi
+	done
+	cp $FBZIMAGE $tmp_dir/linux
+	echo "$KCMDLINE" > $tmp_dir/isolinux.cfg
+	if [ -f "$FDINITRD" ] ; then
+		cp "$FDINITRD" $tmp_dir/initrd.img
+	fi
+	mkisofs -J -r -input-charset=utf-8 -quiet -o $FIMAGE -b isolinux.bin \
+		-c boot.cat -no-emul-boot -boot-load-size 4 -boot-info-table \
+		$tmp_dir
+	isohybrid $FIMAGE 2>/dev/null || true
+	rm -rf $tmp_dir
+}
+
+case $1 in
+	bzdisk)     genbzdisk;;
+	fdimage144) genfdimage144;;
+	fdimage288) genfdimage288;;
+	isoimage)   genisoimage;;
+	*)          echo 'Unknown image format'; exit 1;
+esac
diff --git a/arch/x86/configs/tiny.config b/arch/x86/configs/tiny.config
index 550cd5012b73..66c9e2aab16c 100644
--- a/arch/x86/configs/tiny.config
+++ b/arch/x86/configs/tiny.config
@@ -1,5 +1,5 @@
 CONFIG_NOHIGHMEM=y
 # CONFIG_HIGHMEM4G is not set
 # CONFIG_HIGHMEM64G is not set
-CONFIG_GUESS_UNWINDER=y
-# CONFIG_FRAME_POINTER_UNWINDER is not set
+CONFIG_UNWINDER_GUESS=y
+# CONFIG_UNWINDER_FRAME_POINTER is not set
diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig
index 4a4b16e56d35..e32fc1f274d8 100644
--- a/arch/x86/configs/x86_64_defconfig
+++ b/arch/x86/configs/x86_64_defconfig
@@ -299,6 +299,7 @@ CONFIG_DEBUG_STACKOVERFLOW=y
 # CONFIG_DEBUG_RODATA_TEST is not set
 CONFIG_DEBUG_BOOT_PARAMS=y
 CONFIG_OPTIMIZE_INLINING=y
+CONFIG_UNWINDER_ORC=y
 CONFIG_SECURITY=y
 CONFIG_SECURITY_NETWORK=y
 CONFIG_SECURITY_SELINUX=y
diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 6e160031cfea..3fd8bc560fae 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -142,56 +142,25 @@ For 32-bit we have the following conventions - kernel is built with
 	UNWIND_HINT_REGS offset=\offset
 	.endm
 
-	.macro RESTORE_EXTRA_REGS offset=0
-	movq 0*8+\offset(%rsp), %r15
-	movq 1*8+\offset(%rsp), %r14
-	movq 2*8+\offset(%rsp), %r13
-	movq 3*8+\offset(%rsp), %r12
-	movq 4*8+\offset(%rsp), %rbp
-	movq 5*8+\offset(%rsp), %rbx
-	UNWIND_HINT_REGS offset=\offset extra=0
-	.endm
-
-	.macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
-	.if \rstor_r11
-	movq 6*8(%rsp), %r11
-	.endif
-	.if \rstor_r8910
-	movq 7*8(%rsp), %r10
-	movq 8*8(%rsp), %r9
-	movq 9*8(%rsp), %r8
-	.endif
-	.if \rstor_rax
-	movq 10*8(%rsp), %rax
-	.endif
-	.if \rstor_rcx
-	movq 11*8(%rsp), %rcx
-	.endif
-	.if \rstor_rdx
-	movq 12*8(%rsp), %rdx
-	.endif
-	movq 13*8(%rsp), %rsi
-	movq 14*8(%rsp), %rdi
-	UNWIND_HINT_IRET_REGS offset=16*8
-	.endm
-	.macro RESTORE_C_REGS
-	RESTORE_C_REGS_HELPER 1,1,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RAX
-	RESTORE_C_REGS_HELPER 0,1,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RCX
-	RESTORE_C_REGS_HELPER 1,0,1,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_R11
-	RESTORE_C_REGS_HELPER 1,1,0,1,1
-	.endm
-	.macro RESTORE_C_REGS_EXCEPT_RCX_R11
-	RESTORE_C_REGS_HELPER 1,0,0,1,1
-	.endm
-
-	.macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
-	subq $-(15*8+\addskip), %rsp
+	.macro POP_EXTRA_REGS
+	popq %r15
+	popq %r14
+	popq %r13
+	popq %r12
+	popq %rbp
+	popq %rbx
+	.endm
+
+	.macro POP_C_REGS
+	popq %r11
+	popq %r10
+	popq %r9
+	popq %r8
+	popq %rax
+	popq %rcx
+	popq %rdx
+	popq %rsi
+	popq %rdi
 	.endm
 
 	.macro icebp
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index bcfc5668dcb2..a2b30ec69497 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -221,10 +221,9 @@ entry_SYSCALL_64_fastpath:
 	TRACE_IRQS_ON		/* user mode is traced as IRQs on */
 	movq	RIP(%rsp), %rcx
 	movq	EFLAGS(%rsp), %r11
-	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RSP(%rsp), %rsp
+	addq	$6*8, %rsp	/* skip extra regs -- they were preserved */
 	UNWIND_HINT_EMPTY
-	USERGS_SYSRET64
+	jmp	.Lpop_c_regs_except_rcx_r11_and_sysret
 
 1:
 	/*
@@ -246,17 +245,18 @@ entry_SYSCALL64_slow_path:
 	call	do_syscall_64		/* returns with IRQs disabled */
 
 return_from_SYSCALL_64:
-	RESTORE_EXTRA_REGS
 	TRACE_IRQS_IRETQ		/* we're about to change IF */
 
 	/*
 	 * Try to use SYSRET instead of IRET if we're returning to
-	 * a completely clean 64-bit userspace context.
+	 * a completely clean 64-bit userspace context.  If we're not,
+	 * go to the slow exit path.
 	 */
 	movq	RCX(%rsp), %rcx
 	movq	RIP(%rsp), %r11
-	cmpq	%rcx, %r11			/* RCX == RIP */
-	jne	opportunistic_sysret_failed
+
+	cmpq	%rcx, %r11	/* SYSRET requires RCX == RIP */
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 	/*
 	 * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
@@ -274,14 +274,14 @@ return_from_SYSCALL_64:
 
 	/* If this changed %rcx, it was not canonical */
 	cmpq	%rcx, %r11
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 	cmpq	$__USER_CS, CS(%rsp)		/* CS must match SYSRET */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 	movq	R11(%rsp), %r11
 	cmpq	%r11, EFLAGS(%rsp)		/* R11 == RFLAGS */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 	/*
 	 * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot
@@ -302,12 +302,12 @@ return_from_SYSCALL_64:
 	 * would never get past 'stuck_here'.
 	 */
 	testq	$(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
-	jnz	opportunistic_sysret_failed
+	jnz	swapgs_restore_regs_and_return_to_usermode
 
 	/* nothing to check for RSP */
 
 	cmpq	$__USER_DS, SS(%rsp)		/* SS must match SYSRET */
-	jne	opportunistic_sysret_failed
+	jne	swapgs_restore_regs_and_return_to_usermode
 
 	/*
 	 * We win! This label is here just for ease of understanding
@@ -315,14 +315,20 @@ return_from_SYSCALL_64:
 	 */
 syscall_return_via_sysret:
 	/* rcx and r11 are already restored (see code above) */
-	RESTORE_C_REGS_EXCEPT_RCX_R11
-	movq	RSP(%rsp), %rsp
 	UNWIND_HINT_EMPTY
+	POP_EXTRA_REGS
+.Lpop_c_regs_except_rcx_r11_and_sysret:
+	popq	%rsi	/* skip r11 */
+	popq	%r10
+	popq	%r9
+	popq	%r8
+	popq	%rax
+	popq	%rsi	/* skip rcx */
+	popq	%rdx
+	popq	%rsi
+	popq	%rdi
+	movq	RSP-ORIG_RAX(%rsp), %rsp
 	USERGS_SYSRET64
-
-opportunistic_sysret_failed:
-	SWAPGS
-	jmp	restore_c_regs_and_iret
 END(entry_SYSCALL_64)
 
 ENTRY(stub_ptregs_64)
@@ -423,8 +429,7 @@ ENTRY(ret_from_fork)
 	movq	%rsp, %rdi
 	call	syscall_return_slowpath	/* returns with IRQs disabled */
 	TRACE_IRQS_ON			/* user mode is traced as IRQS on */
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode
 
 1:
 	/* kernel thread */
@@ -612,8 +617,21 @@ GLOBAL(retint_user)
 	mov	%rsp,%rdi
 	call	prepare_exit_to_usermode
 	TRACE_IRQS_IRETQ
+
+GLOBAL(swapgs_restore_regs_and_return_to_usermode)
+#ifdef CONFIG_DEBUG_ENTRY
+	/* Assert that pt_regs indicates user mode. */
+	testb	$3, CS(%rsp)
+	jnz	1f
+	ud2
+1:
+#endif
 	SWAPGS
-	jmp	restore_regs_and_iret
+	POP_EXTRA_REGS
+	POP_C_REGS
+	addq	$8, %rsp	/* skip regs->orig_ax */
+	INTERRUPT_RETURN
+
 
 /* Returning to kernel space */
 retint_kernel:
@@ -633,15 +651,17 @@ retint_kernel:
 	 */
 	TRACE_IRQS_IRETQ
 
-/*
- * At this label, code paths which return to kernel and to user,
- * which come from interrupts/exception and from syscalls, merge.
- */
-GLOBAL(restore_regs_and_iret)
-	RESTORE_EXTRA_REGS
-restore_c_regs_and_iret:
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
+GLOBAL(restore_regs_and_return_to_kernel)
+#ifdef CONFIG_DEBUG_ENTRY
+	/* Assert that pt_regs indicates kernel mode. */
+	testb	$3, CS(%rsp)
+	jz	1f
+	ud2
+1:
+#endif
+	POP_EXTRA_REGS
+	POP_C_REGS
+	addq	$8, %rsp	/* skip regs->orig_ax */
 	INTERRUPT_RETURN
 
 ENTRY(native_iret)
@@ -818,7 +838,7 @@ ENTRY(\sym)
 
 	ASM_CLAC
 
-	.ifeq \has_error_code
+	.if \has_error_code == 0
 	pushq	$-1				/* ORIG_RAX: no syscall to restart */
 	.endif
 
@@ -1059,6 +1079,7 @@ idtentry int3			do_int3			has_error_code=0	paranoid=1 shift_ist=DEBUG_STACK
 idtentry stack_segment		do_stack_segment	has_error_code=1
 
 #ifdef CONFIG_XEN
+idtentry xennmi			do_nmi			has_error_code=0
 idtentry xendebug		do_debug		has_error_code=0
 idtentry xenint3		do_int3			has_error_code=0
 #endif
@@ -1112,17 +1133,14 @@ ENTRY(paranoid_exit)
 	DISABLE_INTERRUPTS(CLBR_ANY)
 	TRACE_IRQS_OFF_DEBUG
 	testl	%ebx, %ebx			/* swapgs needed? */
-	jnz	paranoid_exit_no_swapgs
+	jnz	.Lparanoid_exit_no_swapgs
 	TRACE_IRQS_IRETQ
 	SWAPGS_UNSAFE_STACK
-	jmp	paranoid_exit_restore
-paranoid_exit_no_swapgs:
+	jmp	.Lparanoid_exit_restore
+.Lparanoid_exit_no_swapgs:
 	TRACE_IRQS_IRETQ_DEBUG
-paranoid_exit_restore:
-	RESTORE_EXTRA_REGS
-	RESTORE_C_REGS
-	REMOVE_PT_GPREGS_FROM_STACK 8
-	INTERRUPT_RETURN
+.Lparanoid_exit_restore:
+	jmp restore_regs_and_return_to_kernel
 END(paranoid_exit)
 
 /*
@@ -1223,10 +1241,13 @@ ENTRY(error_exit)
 	jmp	retint_user
 END(error_exit)
 
-/* Runs on exception stack */
-/* XXX: broken on Xen PV */
+/*
+ * Runs on exception stack.  Xen PV does not go through this path at all,
+ * so we can use real assembly here.
+ */
 ENTRY(nmi)
 	UNWIND_HINT_IRET_REGS
+
 	/*
 	 * We allow breakpoints in NMIs. If a breakpoint occurs, then
 	 * the iretq it performs will take us out of NMI context.
@@ -1284,7 +1305,7 @@ ENTRY(nmi)
 	 * stacks lest we corrupt the "NMI executing" variable.
 	 */
 
-	SWAPGS_UNSAFE_STACK
+	swapgs
 	cld
 	movq	%rsp, %rdx
 	movq	PER_CPU_VAR(cpu_current_top_of_stack), %rsp
@@ -1328,8 +1349,7 @@ ENTRY(nmi)
 	 * Return back to user mode.  We must *not* do the normal exit
 	 * work, because we don't want to enable interrupts.
 	 */
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode
 
 .Lnmi_from_kernel:
 	/*
@@ -1450,7 +1470,7 @@ nested_nmi_out:
 	popq	%rdx
 
 	/* We are returning to kernel mode, so this cannot result in a fault. */
-	INTERRUPT_RETURN
+	iretq
 
 first_nmi:
 	/* Restore rdx. */
@@ -1481,7 +1501,7 @@ first_nmi:
 	pushfq			/* RFLAGS */
 	pushq	$__KERNEL_CS	/* CS */
 	pushq	$1f		/* RIP */
-	INTERRUPT_RETURN	/* continues at repeat_nmi below */
+	iretq			/* continues at repeat_nmi below */
 	UNWIND_HINT_IRET_REGS
 1:
 #endif
@@ -1544,29 +1564,34 @@ end_repeat_nmi:
 nmi_swapgs:
 	SWAPGS_UNSAFE_STACK
 nmi_restore:
-	RESTORE_EXTRA_REGS
-	RESTORE_C_REGS
+	POP_EXTRA_REGS
+	POP_C_REGS
 
-	/* Point RSP at the "iret" frame. */
-	REMOVE_PT_GPREGS_FROM_STACK 6*8
+	/*
+	 * Skip orig_ax and the "outermost" frame to point RSP at the
+	 * "iret" frame.
+	 */
+	addq	$6*8, %rsp
 
 	/*
 	 * Clear "NMI executing".  Set DF first so that we can easily
 	 * distinguish the remaining code between here and IRET from
-	 * the SYSCALL entry and exit paths.  On a native kernel, we
-	 * could just inspect RIP, but, on paravirt kernels,
-	 * INTERRUPT_RETURN can translate into a jump into a
-	 * hypercall page.
+	 * the SYSCALL entry and exit paths.
+	 *
+	 * We arguably should just inspect RIP instead, but I (Andy) wrote
+	 * this code when I had the misapprehension that Xen PV supported
+	 * NMIs, and Xen PV would break that approach.
 	 */
 	std
 	movq	$0, 5*8(%rsp)		/* clear "NMI executing" */
 
 	/*
-	 * INTERRUPT_RETURN reads the "iret" frame and exits the NMI
-	 * stack in a single instruction.  We are returning to kernel
-	 * mode, so this cannot result in a fault.
+	 * iretq reads the "iret" frame and exits the NMI stack in a
+	 * single instruction.  We are returning to kernel mode, so this
+	 * cannot result in a fault.  Similarly, we don't need to worry
+	 * about espfix64 on the way back to kernel mode.
 	 */
-	INTERRUPT_RETURN
+	iretq
 END(nmi)
 
 ENTRY(ignore_sysret)
diff --git a/arch/x86/entry/entry_64_compat.S b/arch/x86/entry/entry_64_compat.S
index b5c7a56ed256..568e130d932c 100644
--- a/arch/x86/entry/entry_64_compat.S
+++ b/arch/x86/entry/entry_64_compat.S
@@ -337,8 +337,7 @@ ENTRY(entry_INT80_compat)
 
 	/* Go back to user mode. */
 	TRACE_IRQS_ON
-	SWAPGS
-	jmp	restore_regs_and_iret
+	jmp	swapgs_restore_regs_and_return_to_usermode
 END(entry_INT80_compat)
 
 ENTRY(stub32_clone)
diff --git a/arch/x86/entry/syscalls/Makefile b/arch/x86/entry/syscalls/Makefile
index 331f1dca5085..6fb9b57ed5ba 100644
--- a/arch/x86/entry/syscalls/Makefile
+++ b/arch/x86/entry/syscalls/Makefile
@@ -1,6 +1,6 @@
 # SPDX-License-Identifier: GPL-2.0
-out := $(obj)/../../include/generated/asm
-uapi := $(obj)/../../include/generated/uapi/asm
+out := arch/$(SRCARCH)/include/generated/asm
+uapi := arch/$(SRCARCH)/include/generated/uapi/asm
 
 # Create output directory if not already present
 _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \
diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c
index 1911310959f8..d63053142b16 100644
--- a/arch/x86/entry/vdso/vma.c
+++ b/arch/x86/entry/vdso/vma.c
@@ -114,10 +114,11 @@ static int vvar_fault(const struct vm_special_mapping *sm,
 		struct pvclock_vsyscall_time_info *pvti =
 			pvclock_pvti_cpu0_va();
 		if (pvti && vclock_was_used(VCLOCK_PVCLOCK)) {
-			ret = vm_insert_pfn(
+			ret = vm_insert_pfn_prot(
 				vma,
 				vmf->address,
-				__pa(pvti) >> PAGE_SHIFT);
+				__pa(pvti) >> PAGE_SHIFT,
+				pgprot_decrypted(vma->vm_page_prot));
 		}
 	} else if (sym_offset == image->sym_hvclock_page) {
 		struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page();
diff --git a/arch/x86/include/asm/archrandom.h b/arch/x86/include/asm/archrandom.h
index 5b0579abb398..3ac991d81e74 100644
--- a/arch/x86/include/asm/archrandom.h
+++ b/arch/x86/include/asm/archrandom.h
@@ -45,7 +45,7 @@ static inline bool rdrand_long(unsigned long *v)
 	bool ok;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	do {
-		asm volatile(RDRAND_LONG "\n\t"
+		asm volatile(RDRAND_LONG
 			     CC_SET(c)
 			     : CC_OUT(c) (ok), "=a" (*v));
 		if (ok)
@@ -59,7 +59,7 @@ static inline bool rdrand_int(unsigned int *v)
 	bool ok;
 	unsigned int retry = RDRAND_RETRY_LOOPS;
 	do {
-		asm volatile(RDRAND_INT "\n\t"
+		asm volatile(RDRAND_INT
 			     CC_SET(c)
 			     : CC_OUT(c) (ok), "=a" (*v));
 		if (ok)
@@ -71,7 +71,7 @@ static inline bool rdrand_int(unsigned int *v)
 static inline bool rdseed_long(unsigned long *v)
 {
 	bool ok;
-	asm volatile(RDSEED_LONG "\n\t"
+	asm volatile(RDSEED_LONG
 		     CC_SET(c)
 		     : CC_OUT(c) (ok), "=a" (*v));
 	return ok;
@@ -80,7 +80,7 @@ static inline bool rdseed_long(unsigned long *v)
 static inline bool rdseed_int(unsigned int *v)
 {
 	bool ok;
-	asm volatile(RDSEED_INT "\n\t"
+	asm volatile(RDSEED_INT
 		     CC_SET(c)
 		     : CC_OUT(c) (ok), "=a" (*v));
 	return ok;
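The archrandom.h hunks above only drop a redundant "\n\t" in front of CC_SET(), which already begins a new line in the asm template; the bounded-retry pattern around RDRAND is unchanged. Outside the kernel the same pattern can be written with the compiler intrinsic -- a sketch reusing the kernel's retry count of 10:

#include <immintrin.h>
#include <stdbool.h>

#define RDRAND_RETRY_LOOPS 10	/* mirrors the kernel's constant */

/*
 * Sketch: RDRAND with the kernel's bounded-retry policy.  Build with
 * -mrdrnd (or -march=native) on a CPU that has the RDRAND feature.
 */
static bool rdrand_u64(unsigned long long *v)
{
	int i;

	for (i = 0; i < RDRAND_RETRY_LOOPS; i++) {
		if (_rdrand64_step(v))	/* CF=1: a random value was stored */
			return true;
	}
	return false;			/* hardware temporarily exhausted */
}
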
diff --git a/arch/x86/include/asm/bitops.h b/arch/x86/include/asm/bitops.h
index 2bcf47314959..3fa039855b8f 100644
--- a/arch/x86/include/asm/bitops.h
+++ b/arch/x86/include/asm/bitops.h
@@ -143,7 +143,7 @@ static __always_inline void __clear_bit(long nr, volatile unsigned long *addr)
 static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr)
 {
 	bool negative;
-	asm volatile(LOCK_PREFIX "andb %2,%1\n\t"
+	asm volatile(LOCK_PREFIX "andb %2,%1"
 		CC_SET(s)
 		: CC_OUT(s) (negative), ADDR
 		: "ir" ((char) ~(1 << nr)) : "memory");
@@ -246,7 +246,7 @@ static __always_inline bool __test_and_set_bit(long nr, volatile unsigned long *
 {
 	bool oldbit;
 
-	asm("bts %2,%1\n\t"
+	asm("bts %2,%1"
 	    CC_SET(c)
 	    : CC_OUT(c) (oldbit), ADDR
 	    : "Ir" (nr));
@@ -286,7 +286,7 @@ static __always_inline bool __test_and_clear_bit(long nr, volatile unsigned long
 {
 	bool oldbit;
 
-	asm volatile("btr %2,%1\n\t"
+	asm volatile("btr %2,%1"
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit), ADDR
 		     : "Ir" (nr));
@@ -298,7 +298,7 @@ static __always_inline bool __test_and_change_bit(long nr, volatile unsigned lon
 {
 	bool oldbit;
 
-	asm volatile("btc %2,%1\n\t"
+	asm volatile("btc %2,%1"
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit), ADDR
 		     : "Ir" (nr) : "memory");
@@ -329,7 +329,7 @@ static __always_inline bool variable_test_bit(long nr, volatile const unsigned l
 {
 	bool oldbit;
 
-	asm volatile("bt %2,%1\n\t"
+	asm volatile("bt %2,%1"
 		     CC_SET(c)
 		     : CC_OUT(c) (oldbit)
 		     : "m" (*(unsigned long *)addr), "Ir" (nr));
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 9eef9cc64c68..a600a6cda9ec 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -7,6 +7,7 @@
  */
 #include <linux/types.h>
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <asm/processor.h>
 #include <asm/user32.h>
 #include <asm/unistd.h>
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 0dfa68438e80..bf6a76202a77 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -126,11 +126,10 @@ extern const char * const x86_bug_flags[NBUGINTS*32];
 #define boot_cpu_has(bit)	cpu_has(&boot_cpu_data, bit)
 
 #define set_cpu_cap(c, bit)	set_bit(bit, (unsigned long *)((c)->x86_capability))
-#define clear_cpu_cap(c, bit)	clear_bit(bit, (unsigned long *)((c)->x86_capability))
-#define setup_clear_cpu_cap(bit) do { \
-	clear_cpu_cap(&boot_cpu_data, bit);	\
-	set_bit(bit, (unsigned long *)cpu_caps_cleared); \
-} while (0)
+
+extern void setup_clear_cpu_cap(unsigned int bit);
+extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit);
+
 #define setup_force_cpu_cap(bit) do { \
 	set_cpu_cap(&boot_cpu_data, bit);	\
 	set_bit(bit, (unsigned long *)cpu_caps_set);	\
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index 793690fbda36..c0b0e9e8aa66 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -13,173 +13,176 @@
 /*
  * Defines x86 CPU feature bits
  */
-#define NCAPINTS	18	/* N 32-bit words worth of info */
-#define NBUGINTS	1	/* N 32-bit bug flags */
+#define NCAPINTS			18	   /* N 32-bit words worth of info */
+#define NBUGINTS			1	   /* N 32-bit bug flags */
 
 /*
  * Note: If the comment begins with a quoted string, that string is used
  * in /proc/cpuinfo instead of the macro name.  If the string is "",
  * this feature bit is not displayed in /proc/cpuinfo at all.
+ *
+ * When adding new features here that depend on other features,
+ * please update the table in kernel/cpu/cpuid-deps.c as well.
  */
 
-/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */
-#define X86_FEATURE_FPU		( 0*32+ 0) /* Onboard FPU */
-#define X86_FEATURE_VME		( 0*32+ 1) /* Virtual Mode Extensions */
-#define X86_FEATURE_DE		( 0*32+ 2) /* Debugging Extensions */
-#define X86_FEATURE_PSE		( 0*32+ 3) /* Page Size Extensions */
-#define X86_FEATURE_TSC		( 0*32+ 4) /* Time Stamp Counter */
-#define X86_FEATURE_MSR		( 0*32+ 5) /* Model-Specific Registers */
-#define X86_FEATURE_PAE		( 0*32+ 6) /* Physical Address Extensions */
-#define X86_FEATURE_MCE		( 0*32+ 7) /* Machine Check Exception */
-#define X86_FEATURE_CX8		( 0*32+ 8) /* CMPXCHG8 instruction */
-#define X86_FEATURE_APIC	( 0*32+ 9) /* Onboard APIC */
-#define X86_FEATURE_SEP		( 0*32+11) /* SYSENTER/SYSEXIT */
-#define X86_FEATURE_MTRR	( 0*32+12) /* Memory Type Range Registers */
-#define X86_FEATURE_PGE		( 0*32+13) /* Page Global Enable */
-#define X86_FEATURE_MCA		( 0*32+14) /* Machine Check Architecture */
-#define X86_FEATURE_CMOV	( 0*32+15) /* CMOV instructions */
-					  /* (plus FCMOVcc, FCOMI with FPU) */
-#define X86_FEATURE_PAT		( 0*32+16) /* Page Attribute Table */
-#define X86_FEATURE_PSE36	( 0*32+17) /* 36-bit PSEs */
-#define X86_FEATURE_PN		( 0*32+18) /* Processor serial number */
-#define X86_FEATURE_CLFLUSH	( 0*32+19) /* CLFLUSH instruction */
-#define X86_FEATURE_DS		( 0*32+21) /* "dts" Debug Store */
-#define X86_FEATURE_ACPI	( 0*32+22) /* ACPI via MSR */
-#define X86_FEATURE_MMX		( 0*32+23) /* Multimedia Extensions */
-#define X86_FEATURE_FXSR	( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
-#define X86_FEATURE_XMM		( 0*32+25) /* "sse" */
-#define X86_FEATURE_XMM2	( 0*32+26) /* "sse2" */
-#define X86_FEATURE_SELFSNOOP	( 0*32+27) /* "ss" CPU self snoop */
-#define X86_FEATURE_HT		( 0*32+28) /* Hyper-Threading */
-#define X86_FEATURE_ACC		( 0*32+29) /* "tm" Automatic clock control */
-#define X86_FEATURE_IA64	( 0*32+30) /* IA-64 processor */
-#define X86_FEATURE_PBE		( 0*32+31) /* Pending Break Enable */
+/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */
+#define X86_FEATURE_FPU			( 0*32+ 0) /* Onboard FPU */
+#define X86_FEATURE_VME			( 0*32+ 1) /* Virtual Mode Extensions */
+#define X86_FEATURE_DE			( 0*32+ 2) /* Debugging Extensions */
+#define X86_FEATURE_PSE			( 0*32+ 3) /* Page Size Extensions */
+#define X86_FEATURE_TSC			( 0*32+ 4) /* Time Stamp Counter */
+#define X86_FEATURE_MSR			( 0*32+ 5) /* Model-Specific Registers */
+#define X86_FEATURE_PAE			( 0*32+ 6) /* Physical Address Extensions */
+#define X86_FEATURE_MCE			( 0*32+ 7) /* Machine Check Exception */
+#define X86_FEATURE_CX8			( 0*32+ 8) /* CMPXCHG8 instruction */
+#define X86_FEATURE_APIC		( 0*32+ 9) /* Onboard APIC */
+#define X86_FEATURE_SEP			( 0*32+11) /* SYSENTER/SYSEXIT */
+#define X86_FEATURE_MTRR		( 0*32+12) /* Memory Type Range Registers */
+#define X86_FEATURE_PGE			( 0*32+13) /* Page Global Enable */
+#define X86_FEATURE_MCA			( 0*32+14) /* Machine Check Architecture */
+#define X86_FEATURE_CMOV		( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */
+#define X86_FEATURE_PAT			( 0*32+16) /* Page Attribute Table */
+#define X86_FEATURE_PSE36		( 0*32+17) /* 36-bit PSEs */
+#define X86_FEATURE_PN			( 0*32+18) /* Processor serial number */
+#define X86_FEATURE_CLFLUSH		( 0*32+19) /* CLFLUSH instruction */
+#define X86_FEATURE_DS			( 0*32+21) /* "dts" Debug Store */
+#define X86_FEATURE_ACPI		( 0*32+22) /* ACPI via MSR */
+#define X86_FEATURE_MMX			( 0*32+23) /* Multimedia Extensions */
+#define X86_FEATURE_FXSR		( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */
+#define X86_FEATURE_XMM			( 0*32+25) /* "sse" */
+#define X86_FEATURE_XMM2		( 0*32+26) /* "sse2" */
+#define X86_FEATURE_SELFSNOOP		( 0*32+27) /* "ss" CPU self snoop */
+#define X86_FEATURE_HT			( 0*32+28) /* Hyper-Threading */
+#define X86_FEATURE_ACC			( 0*32+29) /* "tm" Automatic clock control */
+#define X86_FEATURE_IA64		( 0*32+30) /* IA-64 processor */
+#define X86_FEATURE_PBE			( 0*32+31) /* Pending Break Enable */
 
 /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */
 /* Don't duplicate feature flags which are redundant with Intel! */
-#define X86_FEATURE_SYSCALL	( 1*32+11) /* SYSCALL/SYSRET */
-#define X86_FEATURE_MP		( 1*32+19) /* MP Capable. */
-#define X86_FEATURE_NX		( 1*32+20) /* Execute Disable */
-#define X86_FEATURE_MMXEXT	( 1*32+22) /* AMD MMX extensions */
-#define X86_FEATURE_FXSR_OPT	( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
-#define X86_FEATURE_GBPAGES	( 1*32+26) /* "pdpe1gb" GB pages */
-#define X86_FEATURE_RDTSCP	( 1*32+27) /* RDTSCP */
-#define X86_FEATURE_LM		( 1*32+29) /* Long Mode (x86-64) */
-#define X86_FEATURE_3DNOWEXT	( 1*32+30) /* AMD 3DNow! extensions */
-#define X86_FEATURE_3DNOW	( 1*32+31) /* 3DNow! */
+#define X86_FEATURE_SYSCALL		( 1*32+11) /* SYSCALL/SYSRET */
+#define X86_FEATURE_MP			( 1*32+19) /* MP Capable */
+#define X86_FEATURE_NX			( 1*32+20) /* Execute Disable */
+#define X86_FEATURE_MMXEXT		( 1*32+22) /* AMD MMX extensions */
+#define X86_FEATURE_FXSR_OPT		( 1*32+25) /* FXSAVE/FXRSTOR optimizations */
+#define X86_FEATURE_GBPAGES		( 1*32+26) /* "pdpe1gb" GB pages */
+#define X86_FEATURE_RDTSCP		( 1*32+27) /* RDTSCP */
+#define X86_FEATURE_LM			( 1*32+29) /* Long Mode (x86-64, 64-bit support) */
+#define X86_FEATURE_3DNOWEXT		( 1*32+30) /* AMD 3DNow extensions */
+#define X86_FEATURE_3DNOW		( 1*32+31) /* 3DNow */
 
 /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */
-#define X86_FEATURE_RECOVERY	( 2*32+ 0) /* CPU in recovery mode */
-#define X86_FEATURE_LONGRUN	( 2*32+ 1) /* Longrun power control */
-#define X86_FEATURE_LRTI	( 2*32+ 3) /* LongRun table interface */
+#define X86_FEATURE_RECOVERY		( 2*32+ 0) /* CPU in recovery mode */
+#define X86_FEATURE_LONGRUN		( 2*32+ 1) /* Longrun power control */
+#define X86_FEATURE_LRTI		( 2*32+ 3) /* LongRun table interface */
 
 /* Other features, Linux-defined mapping, word 3 */
 /* This range is used for feature bits which conflict or are synthesized */
-#define X86_FEATURE_CXMMX	( 3*32+ 0) /* Cyrix MMX extensions */
-#define X86_FEATURE_K6_MTRR	( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
-#define X86_FEATURE_CYRIX_ARR	( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
-#define X86_FEATURE_CENTAUR_MCR	( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
-/* cpu types for specific tunings: */
-#define X86_FEATURE_K8		( 3*32+ 4) /* "" Opteron, Athlon64 */
-#define X86_FEATURE_K7		( 3*32+ 5) /* "" Athlon */
-#define X86_FEATURE_P3		( 3*32+ 6) /* "" P3 */
-#define X86_FEATURE_P4		( 3*32+ 7) /* "" P4 */
-#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */
-#define X86_FEATURE_UP		( 3*32+ 9) /* smp kernel running on up */
-#define X86_FEATURE_ART		( 3*32+10) /* Platform has always running timer (ART) */
-#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */
-#define X86_FEATURE_PEBS	( 3*32+12) /* Precise-Event Based Sampling */
-#define X86_FEATURE_BTS		( 3*32+13) /* Branch Trace Store */
-#define X86_FEATURE_SYSCALL32	( 3*32+14) /* "" syscall in ia32 userspace */
-#define X86_FEATURE_SYSENTER32	( 3*32+15) /* "" sysenter in ia32 userspace */
-#define X86_FEATURE_REP_GOOD	( 3*32+16) /* rep microcode works well */
-#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */
-#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */
-#define X86_FEATURE_ACC_POWER	( 3*32+19) /* AMD Accumulated Power Mechanism */
-#define X86_FEATURE_NOPL	( 3*32+20) /* The NOPL (0F 1F) instructions */
-#define X86_FEATURE_ALWAYS	( 3*32+21) /* "" Always-present feature */
-#define X86_FEATURE_XTOPOLOGY	( 3*32+22) /* cpu topology enum extensions */
-#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */
-#define X86_FEATURE_NONSTOP_TSC	( 3*32+24) /* TSC does not stop in C states */
-#define X86_FEATURE_CPUID	( 3*32+25) /* CPU has CPUID instruction itself */
-#define X86_FEATURE_EXTD_APICID	( 3*32+26) /* has extended APICID (8 bits) */
-#define X86_FEATURE_AMD_DCM     ( 3*32+27) /* multi-node processor */
-#define X86_FEATURE_APERFMPERF	( 3*32+28) /* APERFMPERF */
-#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */
-#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */
+#define X86_FEATURE_CXMMX		( 3*32+ 0) /* Cyrix MMX extensions */
+#define X86_FEATURE_K6_MTRR		( 3*32+ 1) /* AMD K6 nonstandard MTRRs */
+#define X86_FEATURE_CYRIX_ARR		( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */
+#define X86_FEATURE_CENTAUR_MCR		( 3*32+ 3) /* Centaur MCRs (= MTRRs) */
+
+/* CPU types for specific tunings: */
+#define X86_FEATURE_K8			( 3*32+ 4) /* "" Opteron, Athlon64 */
+#define X86_FEATURE_K7			( 3*32+ 5) /* "" Athlon */
+#define X86_FEATURE_P3			( 3*32+ 6) /* "" P3 */
+#define X86_FEATURE_P4			( 3*32+ 7) /* "" P4 */
+#define X86_FEATURE_CONSTANT_TSC	( 3*32+ 8) /* TSC ticks at a constant rate */
+#define X86_FEATURE_UP			( 3*32+ 9) /* SMP kernel running on UP */
+#define X86_FEATURE_ART			( 3*32+10) /* Always running timer (ART) */
+#define X86_FEATURE_ARCH_PERFMON	( 3*32+11) /* Intel Architectural PerfMon */
+#define X86_FEATURE_PEBS		( 3*32+12) /* Precise-Event Based Sampling */
+#define X86_FEATURE_BTS			( 3*32+13) /* Branch Trace Store */
+#define X86_FEATURE_SYSCALL32		( 3*32+14) /* "" syscall in IA32 userspace */
+#define X86_FEATURE_SYSENTER32		( 3*32+15) /* "" sysenter in IA32 userspace */
+#define X86_FEATURE_REP_GOOD		( 3*32+16) /* REP microcode works well */
+#define X86_FEATURE_MFENCE_RDTSC	( 3*32+17) /* "" MFENCE synchronizes RDTSC */
+#define X86_FEATURE_LFENCE_RDTSC	( 3*32+18) /* "" LFENCE synchronizes RDTSC */
+#define X86_FEATURE_ACC_POWER		( 3*32+19) /* AMD Accumulated Power Mechanism */
+#define X86_FEATURE_NOPL		( 3*32+20) /* The NOPL (0F 1F) instructions */
+#define X86_FEATURE_ALWAYS		( 3*32+21) /* "" Always-present feature */
+#define X86_FEATURE_XTOPOLOGY		( 3*32+22) /* CPU topology enum extensions */
+#define X86_FEATURE_TSC_RELIABLE	( 3*32+23) /* TSC is known to be reliable */
+#define X86_FEATURE_NONSTOP_TSC		( 3*32+24) /* TSC does not stop in C states */
+#define X86_FEATURE_CPUID		( 3*32+25) /* CPU has CPUID instruction itself */
+#define X86_FEATURE_EXTD_APICID		( 3*32+26) /* Extended APICID (8 bits) */
+#define X86_FEATURE_AMD_DCM		( 3*32+27) /* AMD multi-node processor */
+#define X86_FEATURE_APERFMPERF		( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */
+#define X86_FEATURE_NONSTOP_TSC_S3	( 3*32+30) /* TSC doesn't stop in S3 state */
+#define X86_FEATURE_TSC_KNOWN_FREQ	( 3*32+31) /* TSC has known frequency */
 
-/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */
-#define X86_FEATURE_XMM3	( 4*32+ 0) /* "pni" SSE-3 */
-#define X86_FEATURE_PCLMULQDQ	( 4*32+ 1) /* PCLMULQDQ instruction */
-#define X86_FEATURE_DTES64	( 4*32+ 2) /* 64-bit Debug Store */
-#define X86_FEATURE_MWAIT	( 4*32+ 3) /* "monitor" Monitor/Mwait support */
-#define X86_FEATURE_DSCPL	( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */
-#define X86_FEATURE_VMX		( 4*32+ 5) /* Hardware virtualization */
-#define X86_FEATURE_SMX		( 4*32+ 6) /* Safer mode */
-#define X86_FEATURE_EST		( 4*32+ 7) /* Enhanced SpeedStep */
-#define X86_FEATURE_TM2		( 4*32+ 8) /* Thermal Monitor 2 */
-#define X86_FEATURE_SSSE3	( 4*32+ 9) /* Supplemental SSE-3 */
-#define X86_FEATURE_CID		( 4*32+10) /* Context ID */
-#define X86_FEATURE_SDBG	( 4*32+11) /* Silicon Debug */
-#define X86_FEATURE_FMA		( 4*32+12) /* Fused multiply-add */
-#define X86_FEATURE_CX16	( 4*32+13) /* CMPXCHG16B */
-#define X86_FEATURE_XTPR	( 4*32+14) /* Send Task Priority Messages */
-#define X86_FEATURE_PDCM	( 4*32+15) /* Performance Capabilities */
-#define X86_FEATURE_PCID	( 4*32+17) /* Process Context Identifiers */
-#define X86_FEATURE_DCA		( 4*32+18) /* Direct Cache Access */
-#define X86_FEATURE_XMM4_1	( 4*32+19) /* "sse4_1" SSE-4.1 */
-#define X86_FEATURE_XMM4_2	( 4*32+20) /* "sse4_2" SSE-4.2 */
-#define X86_FEATURE_X2APIC	( 4*32+21) /* x2APIC */
-#define X86_FEATURE_MOVBE	( 4*32+22) /* MOVBE instruction */
-#define X86_FEATURE_POPCNT      ( 4*32+23) /* POPCNT instruction */
-#define X86_FEATURE_TSC_DEADLINE_TIMER	( 4*32+24) /* Tsc deadline timer */
-#define X86_FEATURE_AES		( 4*32+25) /* AES instructions */
-#define X86_FEATURE_XSAVE	( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */
-#define X86_FEATURE_OSXSAVE	( 4*32+27) /* "" XSAVE enabled in the OS */
-#define X86_FEATURE_AVX		( 4*32+28) /* Advanced Vector Extensions */
-#define X86_FEATURE_F16C	( 4*32+29) /* 16-bit fp conversions */
-#define X86_FEATURE_RDRAND	( 4*32+30) /* The RDRAND instruction */
-#define X86_FEATURE_HYPERVISOR	( 4*32+31) /* Running on a hypervisor */
+/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */
+#define X86_FEATURE_XMM3		( 4*32+ 0) /* "pni" SSE-3 */
+#define X86_FEATURE_PCLMULQDQ		( 4*32+ 1) /* PCLMULQDQ instruction */
+#define X86_FEATURE_DTES64		( 4*32+ 2) /* 64-bit Debug Store */
+#define X86_FEATURE_MWAIT		( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */
+#define X86_FEATURE_DSCPL		( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */
+#define X86_FEATURE_VMX			( 4*32+ 5) /* Hardware virtualization */
+#define X86_FEATURE_SMX			( 4*32+ 6) /* Safer Mode eXtensions */
+#define X86_FEATURE_EST			( 4*32+ 7) /* Enhanced SpeedStep */
+#define X86_FEATURE_TM2			( 4*32+ 8) /* Thermal Monitor 2 */
+#define X86_FEATURE_SSSE3		( 4*32+ 9) /* Supplemental SSE-3 */
+#define X86_FEATURE_CID			( 4*32+10) /* Context ID */
+#define X86_FEATURE_SDBG		( 4*32+11) /* Silicon Debug */
+#define X86_FEATURE_FMA			( 4*32+12) /* Fused multiply-add */
+#define X86_FEATURE_CX16		( 4*32+13) /* CMPXCHG16B instruction */
+#define X86_FEATURE_XTPR		( 4*32+14) /* Send Task Priority Messages */
+#define X86_FEATURE_PDCM		( 4*32+15) /* Perf/Debug Capabilities MSR */
+#define X86_FEATURE_PCID		( 4*32+17) /* Process Context Identifiers */
+#define X86_FEATURE_DCA			( 4*32+18) /* Direct Cache Access */
+#define X86_FEATURE_XMM4_1		( 4*32+19) /* "sse4_1" SSE-4.1 */
+#define X86_FEATURE_XMM4_2		( 4*32+20) /* "sse4_2" SSE-4.2 */
+#define X86_FEATURE_X2APIC		( 4*32+21) /* X2APIC */
+#define X86_FEATURE_MOVBE		( 4*32+22) /* MOVBE instruction */
+#define X86_FEATURE_POPCNT		( 4*32+23) /* POPCNT instruction */
+#define X86_FEATURE_TSC_DEADLINE_TIMER	( 4*32+24) /* TSC deadline timer */
+#define X86_FEATURE_AES			( 4*32+25) /* AES instructions */
+#define X86_FEATURE_XSAVE		( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */
+#define X86_FEATURE_OSXSAVE		( 4*32+27) /* "" XSAVE instruction enabled in the OS */
+#define X86_FEATURE_AVX			( 4*32+28) /* Advanced Vector Extensions */
+#define X86_FEATURE_F16C		( 4*32+29) /* 16-bit FP conversions */
+#define X86_FEATURE_RDRAND		( 4*32+30) /* RDRAND instruction */
+#define X86_FEATURE_HYPERVISOR		( 4*32+31) /* Running on a hypervisor */
 
 /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */
-#define X86_FEATURE_XSTORE	( 5*32+ 2) /* "rng" RNG present (xstore) */
-#define X86_FEATURE_XSTORE_EN	( 5*32+ 3) /* "rng_en" RNG enabled */
-#define X86_FEATURE_XCRYPT	( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
-#define X86_FEATURE_XCRYPT_EN	( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
-#define X86_FEATURE_ACE2	( 5*32+ 8) /* Advanced Cryptography Engine v2 */
-#define X86_FEATURE_ACE2_EN	( 5*32+ 9) /* ACE v2 enabled */
-#define X86_FEATURE_PHE		( 5*32+10) /* PadLock Hash Engine */
-#define X86_FEATURE_PHE_EN	( 5*32+11) /* PHE enabled */
-#define X86_FEATURE_PMM		( 5*32+12) /* PadLock Montgomery Multiplier */
-#define X86_FEATURE_PMM_EN	( 5*32+13) /* PMM enabled */
+#define X86_FEATURE_XSTORE		( 5*32+ 2) /* "rng" RNG present (xstore) */
+#define X86_FEATURE_XSTORE_EN		( 5*32+ 3) /* "rng_en" RNG enabled */
+#define X86_FEATURE_XCRYPT		( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */
+#define X86_FEATURE_XCRYPT_EN		( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */
+#define X86_FEATURE_ACE2		( 5*32+ 8) /* Advanced Cryptography Engine v2 */
+#define X86_FEATURE_ACE2_EN		( 5*32+ 9) /* ACE v2 enabled */
+#define X86_FEATURE_PHE			( 5*32+10) /* PadLock Hash Engine */
+#define X86_FEATURE_PHE_EN		( 5*32+11) /* PHE enabled */
+#define X86_FEATURE_PMM			( 5*32+12) /* PadLock Montgomery Multiplier */
+#define X86_FEATURE_PMM_EN		( 5*32+13) /* PMM enabled */
 
-/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */
-#define X86_FEATURE_LAHF_LM	( 6*32+ 0) /* LAHF/SAHF in long mode */
-#define X86_FEATURE_CMP_LEGACY	( 6*32+ 1) /* If yes HyperThreading not valid */
-#define X86_FEATURE_SVM		( 6*32+ 2) /* Secure virtual machine */
-#define X86_FEATURE_EXTAPIC	( 6*32+ 3) /* Extended APIC space */
-#define X86_FEATURE_CR8_LEGACY	( 6*32+ 4) /* CR8 in 32-bit mode */
-#define X86_FEATURE_ABM		( 6*32+ 5) /* Advanced bit manipulation */
-#define X86_FEATURE_SSE4A	( 6*32+ 6) /* SSE-4A */
-#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */
-#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */
-#define X86_FEATURE_OSVW	( 6*32+ 9) /* OS Visible Workaround */
-#define X86_FEATURE_IBS		( 6*32+10) /* Instruction Based Sampling */
-#define X86_FEATURE_XOP		( 6*32+11) /* extended AVX instructions */
-#define X86_FEATURE_SKINIT	( 6*32+12) /* SKINIT/STGI instructions */
-#define X86_FEATURE_WDT		( 6*32+13) /* Watchdog timer */
-#define X86_FEATURE_LWP		( 6*32+15) /* Light Weight Profiling */
-#define X86_FEATURE_FMA4	( 6*32+16) /* 4 operands MAC instructions */
-#define X86_FEATURE_TCE		( 6*32+17) /* translation cache extension */
-#define X86_FEATURE_NODEID_MSR	( 6*32+19) /* NodeId MSR */
-#define X86_FEATURE_TBM		( 6*32+21) /* trailing bit manipulations */
-#define X86_FEATURE_TOPOEXT	( 6*32+22) /* topology extensions CPUID leafs */
-#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */
-#define X86_FEATURE_PERFCTR_NB  ( 6*32+24) /* NB performance counter extensions */
-#define X86_FEATURE_BPEXT	(6*32+26) /* data breakpoint extension */
-#define X86_FEATURE_PTSC	( 6*32+27) /* performance time-stamp counter */
-#define X86_FEATURE_PERFCTR_LLC	( 6*32+28) /* Last Level Cache performance counter extensions */
-#define X86_FEATURE_MWAITX	( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */
+/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */
+#define X86_FEATURE_LAHF_LM		( 6*32+ 0) /* LAHF/SAHF in long mode */
+#define X86_FEATURE_CMP_LEGACY		( 6*32+ 1) /* If yes HyperThreading not valid */
+#define X86_FEATURE_SVM			( 6*32+ 2) /* Secure Virtual Machine */
+#define X86_FEATURE_EXTAPIC		( 6*32+ 3) /* Extended APIC space */
+#define X86_FEATURE_CR8_LEGACY		( 6*32+ 4) /* CR8 in 32-bit mode */
+#define X86_FEATURE_ABM			( 6*32+ 5) /* Advanced bit manipulation */
+#define X86_FEATURE_SSE4A		( 6*32+ 6) /* SSE-4A */
+#define X86_FEATURE_MISALIGNSSE		( 6*32+ 7) /* Misaligned SSE mode */
+#define X86_FEATURE_3DNOWPREFETCH	( 6*32+ 8) /* 3DNow prefetch instructions */
+#define X86_FEATURE_OSVW		( 6*32+ 9) /* OS Visible Workaround */
+#define X86_FEATURE_IBS			( 6*32+10) /* Instruction Based Sampling */
+#define X86_FEATURE_XOP			( 6*32+11) /* extended AVX instructions */
+#define X86_FEATURE_SKINIT		( 6*32+12) /* SKINIT/STGI instructions */
+#define X86_FEATURE_WDT			( 6*32+13) /* Watchdog timer */
+#define X86_FEATURE_LWP			( 6*32+15) /* Light Weight Profiling */
+#define X86_FEATURE_FMA4		( 6*32+16) /* 4 operands MAC instructions */
+#define X86_FEATURE_TCE			( 6*32+17) /* Translation Cache Extension */
+#define X86_FEATURE_NODEID_MSR		( 6*32+19) /* NodeId MSR */
+#define X86_FEATURE_TBM			( 6*32+21) /* Trailing Bit Manipulations */
+#define X86_FEATURE_TOPOEXT		( 6*32+22) /* Topology extensions CPUID leafs */
+#define X86_FEATURE_PERFCTR_CORE	( 6*32+23) /* Core performance counter extensions */
+#define X86_FEATURE_PERFCTR_NB		( 6*32+24) /* NB performance counter extensions */
+#define X86_FEATURE_BPEXT		( 6*32+26) /* Data breakpoint extension */
+#define X86_FEATURE_PTSC		( 6*32+27) /* Performance time-stamp counter */
+#define X86_FEATURE_PERFCTR_LLC		( 6*32+28) /* Last Level Cache performance counter extensions */
+#define X86_FEATURE_MWAITX		( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */
 
 /*
  * Auxiliary flags: Linux defined - For features scattered in various
@@ -187,146 +190,154 @@
  *
  * Reuse free bits when adding new feature flags!
  */
-#define X86_FEATURE_RING3MWAIT	( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */
-#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */
-#define X86_FEATURE_CPB		( 7*32+ 2) /* AMD Core Performance Boost */
-#define X86_FEATURE_EPB		( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
-#define X86_FEATURE_CAT_L3	( 7*32+ 4) /* Cache Allocation Technology L3 */
-#define X86_FEATURE_CAT_L2	( 7*32+ 5) /* Cache Allocation Technology L2 */
-#define X86_FEATURE_CDP_L3	( 7*32+ 6) /* Code and Data Prioritization L3 */
+#define X86_FEATURE_RING3MWAIT		( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */
+#define X86_FEATURE_CPUID_FAULT		( 7*32+ 1) /* Intel CPUID faulting */
+#define X86_FEATURE_CPB			( 7*32+ 2) /* AMD Core Performance Boost */
+#define X86_FEATURE_EPB			( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */
+#define X86_FEATURE_CAT_L3		( 7*32+ 4) /* Cache Allocation Technology L3 */
+#define X86_FEATURE_CAT_L2		( 7*32+ 5) /* Cache Allocation Technology L2 */
+#define X86_FEATURE_CDP_L3		( 7*32+ 6) /* Code and Data Prioritization L3 */
 
-#define X86_FEATURE_HW_PSTATE	( 7*32+ 8) /* AMD HW-PState */
-#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */
-#define X86_FEATURE_SME		( 7*32+10) /* AMD Secure Memory Encryption */
+#define X86_FEATURE_HW_PSTATE		( 7*32+ 8) /* AMD HW-PState */
+#define X86_FEATURE_PROC_FEEDBACK	( 7*32+ 9) /* AMD ProcFeedbackInterface */
+#define X86_FEATURE_SME			( 7*32+10) /* AMD Secure Memory Encryption */
 
-#define X86_FEATURE_INTEL_PPIN	( 7*32+14) /* Intel Processor Inventory Number */
-#define X86_FEATURE_INTEL_PT	( 7*32+15) /* Intel Processor Trace */
-#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */
-#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */
+#define X86_FEATURE_INTEL_PPIN		( 7*32+14) /* Intel Processor Inventory Number */
+#define X86_FEATURE_INTEL_PT		( 7*32+15) /* Intel Processor Trace */
+#define X86_FEATURE_AVX512_4VNNIW	( 7*32+16) /* AVX-512 Neural Network Instructions */
+#define X86_FEATURE_AVX512_4FMAPS	( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */
 
-#define X86_FEATURE_MBA         ( 7*32+18) /* Memory Bandwidth Allocation */
+#define X86_FEATURE_MBA			( 7*32+18) /* Memory Bandwidth Allocation */
 
 /* Virtualization flags: Linux defined, word 8 */
-#define X86_FEATURE_TPR_SHADOW  ( 8*32+ 0) /* Intel TPR Shadow */
-#define X86_FEATURE_VNMI        ( 8*32+ 1) /* Intel Virtual NMI */
-#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
-#define X86_FEATURE_EPT         ( 8*32+ 3) /* Intel Extended Page Table */
-#define X86_FEATURE_VPID        ( 8*32+ 4) /* Intel Virtual Processor ID */
+#define X86_FEATURE_TPR_SHADOW		( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI		( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY	( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT			( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID		( 8*32+ 4) /* Intel Virtual Processor ID */
 
-#define X86_FEATURE_VMMCALL     ( 8*32+15) /* Prefer vmmcall to vmcall */
-#define X86_FEATURE_XENPV       ( 8*32+16) /* "" Xen paravirtual guest */
+#define X86_FEATURE_VMMCALL		( 8*32+15) /* Prefer VMMCALL to VMCALL */
+#define X86_FEATURE_XENPV		( 8*32+16) /* "" Xen paravirtual guest */
 
 
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */
-#define X86_FEATURE_FSGSBASE	( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/
-#define X86_FEATURE_TSC_ADJUST	( 9*32+ 1) /* TSC adjustment MSR 0x3b */
-#define X86_FEATURE_BMI1	( 9*32+ 3) /* 1st group bit manipulation extensions */
-#define X86_FEATURE_HLE		( 9*32+ 4) /* Hardware Lock Elision */
-#define X86_FEATURE_AVX2	( 9*32+ 5) /* AVX2 instructions */
-#define X86_FEATURE_SMEP	( 9*32+ 7) /* Supervisor Mode Execution Protection */
-#define X86_FEATURE_BMI2	( 9*32+ 8) /* 2nd group bit manipulation extensions */
-#define X86_FEATURE_ERMS	( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */
-#define X86_FEATURE_INVPCID	( 9*32+10) /* Invalidate Processor Context ID */
-#define X86_FEATURE_RTM		( 9*32+11) /* Restricted Transactional Memory */
-#define X86_FEATURE_CQM		( 9*32+12) /* Cache QoS Monitoring */
-#define X86_FEATURE_MPX		( 9*32+14) /* Memory Protection Extension */
-#define X86_FEATURE_RDT_A	( 9*32+15) /* Resource Director Technology Allocation */
-#define X86_FEATURE_AVX512F	( 9*32+16) /* AVX-512 Foundation */
-#define X86_FEATURE_AVX512DQ	( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
-#define X86_FEATURE_RDSEED	( 9*32+18) /* The RDSEED instruction */
-#define X86_FEATURE_ADX		( 9*32+19) /* The ADCX and ADOX instructions */
-#define X86_FEATURE_SMAP	( 9*32+20) /* Supervisor Mode Access Prevention */
-#define X86_FEATURE_AVX512IFMA  ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
-#define X86_FEATURE_CLFLUSHOPT	( 9*32+23) /* CLFLUSHOPT instruction */
-#define X86_FEATURE_CLWB	( 9*32+24) /* CLWB instruction */
-#define X86_FEATURE_AVX512PF	( 9*32+26) /* AVX-512 Prefetch */
-#define X86_FEATURE_AVX512ER	( 9*32+27) /* AVX-512 Exponential and Reciprocal */
-#define X86_FEATURE_AVX512CD	( 9*32+28) /* AVX-512 Conflict Detection */
-#define X86_FEATURE_SHA_NI	( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
-#define X86_FEATURE_AVX512BW	( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
-#define X86_FEATURE_AVX512VL	( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
+#define X86_FEATURE_FSGSBASE		( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
+#define X86_FEATURE_TSC_ADJUST		( 9*32+ 1) /* TSC adjustment MSR 0x3B */
+#define X86_FEATURE_BMI1		( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE			( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2		( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP		( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2		( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS		( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
+#define X86_FEATURE_INVPCID		( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM			( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM			( 9*32+12) /* Cache QoS Monitoring */
+#define X86_FEATURE_MPX			( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A		( 9*32+15) /* Resource Director Technology Allocation */
+#define X86_FEATURE_AVX512F		( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ		( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */
+#define X86_FEATURE_RDSEED		( 9*32+18) /* RDSEED instruction */
+#define X86_FEATURE_ADX			( 9*32+19) /* ADCX and ADOX instructions */
+#define X86_FEATURE_SMAP		( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_AVX512IFMA		( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */
+#define X86_FEATURE_CLFLUSHOPT		( 9*32+23) /* CLFLUSHOPT instruction */
+#define X86_FEATURE_CLWB		( 9*32+24) /* CLWB instruction */
+#define X86_FEATURE_AVX512PF		( 9*32+26) /* AVX-512 Prefetch */
+#define X86_FEATURE_AVX512ER		( 9*32+27) /* AVX-512 Exponential and Reciprocal */
+#define X86_FEATURE_AVX512CD		( 9*32+28) /* AVX-512 Conflict Detection */
+#define X86_FEATURE_SHA_NI		( 9*32+29) /* SHA1/SHA256 Instruction Extensions */
+#define X86_FEATURE_AVX512BW		( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */
+#define X86_FEATURE_AVX512VL		( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */
 
-/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */
-#define X86_FEATURE_XSAVEOPT	(10*32+ 0) /* XSAVEOPT */
-#define X86_FEATURE_XSAVEC	(10*32+ 1) /* XSAVEC */
-#define X86_FEATURE_XGETBV1	(10*32+ 2) /* XGETBV with ECX = 1 */
-#define X86_FEATURE_XSAVES	(10*32+ 3) /* XSAVES/XRSTORS */
+/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */
+#define X86_FEATURE_XSAVEOPT		(10*32+ 0) /* XSAVEOPT instruction */
+#define X86_FEATURE_XSAVEC		(10*32+ 1) /* XSAVEC instruction */
+#define X86_FEATURE_XGETBV1		(10*32+ 2) /* XGETBV with ECX = 1 instruction */
+#define X86_FEATURE_XSAVES		(10*32+ 3) /* XSAVES/XRSTORS instructions */
 
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */
-#define X86_FEATURE_CQM_LLC	(11*32+ 1) /* LLC QoS if 1 */
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */
+#define X86_FEATURE_CQM_LLC		(11*32+ 1) /* LLC QoS if 1 */
 
-/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */
-#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */
-#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */
-#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */
+/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */
+#define X86_FEATURE_CQM_OCCUP_LLC	(12*32+ 0) /* LLC occupancy monitoring */
+#define X86_FEATURE_CQM_MBM_TOTAL	(12*32+ 1) /* LLC Total MBM monitoring */
+#define X86_FEATURE_CQM_MBM_LOCAL	(12*32+ 2) /* LLC Local MBM monitoring */
 
-/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */
-#define X86_FEATURE_CLZERO	(13*32+0) /* CLZERO instruction */
-#define X86_FEATURE_IRPERF	(13*32+1) /* Instructions Retired Count */
+/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */
+#define X86_FEATURE_CLZERO		(13*32+ 0) /* CLZERO instruction */
+#define X86_FEATURE_IRPERF		(13*32+ 1) /* Instructions Retired Count */
 
-/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */
-#define X86_FEATURE_DTHERM	(14*32+ 0) /* Digital Thermal Sensor */
-#define X86_FEATURE_IDA		(14*32+ 1) /* Intel Dynamic Acceleration */
-#define X86_FEATURE_ARAT	(14*32+ 2) /* Always Running APIC Timer */
-#define X86_FEATURE_PLN		(14*32+ 4) /* Intel Power Limit Notification */
-#define X86_FEATURE_PTS		(14*32+ 6) /* Intel Package Thermal Status */
-#define X86_FEATURE_HWP		(14*32+ 7) /* Intel Hardware P-states */
-#define X86_FEATURE_HWP_NOTIFY	(14*32+ 8) /* HWP Notification */
-#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */
-#define X86_FEATURE_HWP_EPP	(14*32+10) /* HWP Energy Perf. Preference */
-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */
+#define X86_FEATURE_DTHERM		(14*32+ 0) /* Digital Thermal Sensor */
+#define X86_FEATURE_IDA			(14*32+ 1) /* Intel Dynamic Acceleration */
+#define X86_FEATURE_ARAT		(14*32+ 2) /* Always Running APIC Timer */
+#define X86_FEATURE_PLN			(14*32+ 4) /* Intel Power Limit Notification */
+#define X86_FEATURE_PTS			(14*32+ 6) /* Intel Package Thermal Status */
+#define X86_FEATURE_HWP			(14*32+ 7) /* Intel Hardware P-states */
+#define X86_FEATURE_HWP_NOTIFY		(14*32+ 8) /* HWP Notification */
+#define X86_FEATURE_HWP_ACT_WINDOW	(14*32+ 9) /* HWP Activity Window */
+#define X86_FEATURE_HWP_EPP		(14*32+10) /* HWP Energy Perf. Preference */
+#define X86_FEATURE_HWP_PKG_REQ		(14*32+11) /* HWP Package Level Request */
 
-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
-#define X86_FEATURE_NPT		(15*32+ 0) /* Nested Page Table support */
-#define X86_FEATURE_LBRV	(15*32+ 1) /* LBR Virtualization support */
-#define X86_FEATURE_SVML	(15*32+ 2) /* "svm_lock" SVM locking MSR */
-#define X86_FEATURE_NRIPS	(15*32+ 3) /* "nrip_save" SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR  (15*32+ 4) /* "tsc_scale" TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN   (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
-#define X86_FEATURE_AVIC	(15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
-#define X86_FEATURE_VGIF	(15*32+16) /* Virtual GIF */
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
+#define X86_FEATURE_NPT			(15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV		(15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML		(15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS		(15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR		(15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN		(15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID		(15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS	(15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER		(15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD		(15*32+12) /* pause filter threshold */
+#define X86_FEATURE_AVIC		(15*32+13) /* Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD	(15*32+15) /* Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF		(15*32+16) /* Virtual GIF */
 
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
-#define X86_FEATURE_AVX512VBMI  (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
-#define X86_FEATURE_PKU		(16*32+ 3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE	(16*32+ 4) /* OS Protection Keys Enable */
-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57	(16*32+16) /* 5-level page tables */
-#define X86_FEATURE_RDPID	(16*32+22) /* RDPID instruction */
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
+#define X86_FEATURE_AVX512VBMI		(16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
+#define X86_FEATURE_UMIP		(16*32+ 2) /* User Mode Instruction Protection */
+#define X86_FEATURE_PKU			(16*32+ 3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE		(16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2	(16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI		(16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES		(16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ		(16*32+10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI		(16*32+11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG	(16*32+12) /* Support for VPOPCNT[B,W] and VPSHUF-BITQMB instructions */
+#define X86_FEATURE_AVX512_VPOPCNTDQ	(16*32+14) /* POPCNT for vectors of DW/QW */
+#define X86_FEATURE_LA57		(16*32+16) /* 5-level page tables */
+#define X86_FEATURE_RDPID		(16*32+22) /* RDPID instruction */
 
-/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
-#define X86_FEATURE_SUCCOR	(17*32+1) /* Uncorrectable error containment and recovery */
-#define X86_FEATURE_SMCA	(17*32+3) /* Scalable MCA */
+/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
+#define X86_FEATURE_OVERFLOW_RECOV	(17*32+ 0) /* MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR		(17*32+ 1) /* Uncorrectable error containment and recovery */
+#define X86_FEATURE_SMCA		(17*32+ 3) /* Scalable MCA */
 
 /*
  * BUG word(s)
  */
-#define X86_BUG(x)		(NCAPINTS*32 + (x))
+#define X86_BUG(x)			(NCAPINTS*32 + (x))
 
-#define X86_BUG_F00F		X86_BUG(0) /* Intel F00F */
-#define X86_BUG_FDIV		X86_BUG(1) /* FPU FDIV */
-#define X86_BUG_COMA		X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH	X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E	X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
-#define X86_BUG_11AP		X86_BUG(5) /* Bad local APIC aka 11AP */
-#define X86_BUG_FXSAVE_LEAK	X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_BUG_CLFLUSH_MONITOR	X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
-#define X86_BUG_SYSRET_SS_ATTRS	X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#define X86_BUG_F00F			X86_BUG(0) /* Intel F00F */
+#define X86_BUG_FDIV			X86_BUG(1) /* FPU FDIV */
+#define X86_BUG_COMA			X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_AMD_TLB_MMATCH		X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E		X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define X86_BUG_11AP			X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK		X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
+#define X86_BUG_CLFLUSH_MONITOR		X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS		X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
 #ifdef CONFIG_X86_32
 /*
  * 64-bit kernels don't use X86_BUG_ESPFIX.  Make the define conditional
  * to avoid confusion.
  */
-#define X86_BUG_ESPFIX		X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#define X86_BUG_ESPFIX			X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
 #endif
-#define X86_BUG_NULL_SEG	X86_BUG(10) /* Nulling a selector preserves the base */
-#define X86_BUG_SWAPGS_FENCE	X86_BUG(11) /* SWAPGS without input dep on GS */
-#define X86_BUG_MONITOR		X86_BUG(12) /* IPI required to wake up remote CPU */
-#define X86_BUG_AMD_E400	X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_NULL_SEG		X86_BUG(10) /* Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE		X86_BUG(11) /* SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR			X86_BUG(12) /* IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400		X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+
 #endif /* _ASM_X86_CPUFEATURES_H */
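
Each X86_FEATURE_* value above is simply word * 32 + bit, i.e. an index into the 32-bit x86_capability[] words plus a bit position within that word. A minimal standalone sketch of that decomposition (not kernel code; the two values are copied from the hunk above):

#include <stdio.h>

#define X86_FEATURE_UMIP	(16*32 + 2)	/* value from the hunk above */
#define X86_FEATURE_LA57	(16*32 + 16)

static void decode(const char *name, unsigned int feature)
{
	unsigned int word = feature / 32;	/* index into x86_capability[] */
	unsigned int bit  = feature % 32;	/* bit within that 32-bit word */

	printf("%-18s -> word %u, bit %u\n", name, word, bit);
}

int main(void)
{
	decode("X86_FEATURE_UMIP", X86_FEATURE_UMIP);
	decode("X86_FEATURE_LA57", X86_FEATURE_LA57);
	return 0;
}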
diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h
index c10c9128f54e..14d6d5007314 100644
--- a/arch/x86/include/asm/disabled-features.h
+++ b/arch/x86/include/asm/disabled-features.h
@@ -16,6 +16,12 @@
 # define DISABLE_MPX	(1<<(X86_FEATURE_MPX & 31))
 #endif
 
+#ifdef CONFIG_X86_INTEL_UMIP
+# define DISABLE_UMIP	0
+#else
+# define DISABLE_UMIP	(1<<(X86_FEATURE_UMIP & 31))
+#endif
+
 #ifdef CONFIG_X86_64
 # define DISABLE_VME		(1<<(X86_FEATURE_VME & 31))
 # define DISABLE_K6_MTRR	(1<<(X86_FEATURE_K6_MTRR & 31))
@@ -63,7 +69,7 @@
 #define DISABLED_MASK13	0
 #define DISABLED_MASK14	0
 #define DISABLED_MASK15	0
-#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57)
+#define DISABLED_MASK16	(DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP)
 #define DISABLED_MASK17	0
 #define DISABLED_MASK_CHECK BUILD_BUG_ON_ZERO(NCAPINTS != 18)
 
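DISABLE_UMIP mirrors the other DISABLE_* macros: when the config option is off, the feature's bit position within word 16 (X86_FEATURE_UMIP & 31) is folded into DISABLED_MASK16 so the feature reads as absent at compile time. A simplified sketch of that masking, assuming CONFIG_X86_INTEL_UMIP is not set and modelling only word 16 (the real check lives in cpu_feature_enabled()):

#include <stdio.h>

#define X86_FEATURE_UMIP	(16*32 + 2)

/* Assume CONFIG_X86_INTEL_UMIP is not set for this sketch: */
#define DISABLE_UMIP		(1u << (X86_FEATURE_UMIP & 31))
#define DISABLED_MASK16		(DISABLE_UMIP)	/* other DISABLE_* bits omitted */

static int feature_statically_disabled(unsigned int feature)
{
	/* Only word 16 is modelled here. */
	if (feature / 32 == 16)
		return !!(DISABLED_MASK16 & (1u << (feature & 31)));
	return 0;
}

int main(void)
{
	printf("UMIP disabled at compile time: %d\n",
	       feature_statically_disabled(X86_FEATURE_UMIP));
	return 0;
}
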
diff --git a/arch/x86/include/asm/inat.h b/arch/x86/include/asm/inat.h
index 02aff0867211..1c78580e58be 100644
--- a/arch/x86/include/asm/inat.h
+++ b/arch/x86/include/asm/inat.h
@@ -97,6 +97,16 @@
 #define INAT_MAKE_GROUP(grp)	((grp << INAT_GRP_OFFS) | INAT_MODRM)
 #define INAT_MAKE_IMM(imm)	(imm << INAT_IMM_OFFS)
 
+/* Identifiers for segment registers */
+#define INAT_SEG_REG_IGNORE	0
+#define INAT_SEG_REG_DEFAULT	1
+#define INAT_SEG_REG_CS		2
+#define INAT_SEG_REG_SS		3
+#define INAT_SEG_REG_DS		4
+#define INAT_SEG_REG_ES		5
+#define INAT_SEG_REG_FS		6
+#define INAT_SEG_REG_GS		7
+
 /* Attribute search APIs */
 extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode);
 extern int inat_get_last_prefix_id(insn_byte_t last_pfx);
diff --git a/arch/x86/include/asm/insn-eval.h b/arch/x86/include/asm/insn-eval.h
new file mode 100644
index 000000000000..e1d3b4ce8a92
--- /dev/null
+++ b/arch/x86/include/asm/insn-eval.h
@@ -0,0 +1,23 @@
+#ifndef _ASM_X86_INSN_EVAL_H
+#define _ASM_X86_INSN_EVAL_H
+/*
+ * A collection of utility functions for x86 instruction analysis to be
+ * used in a kernel context. Useful when, for instance, making sense
+ * of the registers indicated by operands.
+ */
+
+#include <linux/compiler.h>
+#include <linux/bug.h>
+#include <linux/err.h>
+#include <asm/ptrace.h>
+
+#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf)
+#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
+#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))
+
+void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs);
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs);
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx);
+char insn_get_code_seg_params(struct pt_regs *regs);
+
+#endif /* _ASM_X86_INSN_EVAL_H */
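
The new insn_get_code_seg_params() packs the code segment's default operand and address sizes into a single value using the INSN_CODE_SEG_* macros above. A quick sketch of the round trip; the 4-byte/8-byte pair is just an example choice (as for a 64-bit code segment), not taken from the patch:

#include <stdio.h>

#define INSN_CODE_SEG_ADDR_SZ(params) ((params >> 4) & 0xf)
#define INSN_CODE_SEG_OPND_SZ(params) (params & 0xf)
#define INSN_CODE_SEG_PARAMS(oper_sz, addr_sz) (oper_sz | (addr_sz << 4))

int main(void)
{
	/* e.g. 4-byte operands and 8-byte addresses */
	int params = INSN_CODE_SEG_PARAMS(4, 8);

	printf("operand size: %d bytes\n", INSN_CODE_SEG_OPND_SZ(params));
	printf("address size: %d bytes\n", INSN_CODE_SEG_ADDR_SZ(params));
	return 0;
}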
diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h
index 11398d55aefa..93ae8aee1780 100644
--- a/arch/x86/include/asm/io.h
+++ b/arch/x86/include/asm/io.h
@@ -266,6 +266,21 @@ static inline void slow_down_io(void)
 
 #endif
 
+#ifdef CONFIG_AMD_MEM_ENCRYPT
+#include <linux/jump_label.h>
+
+extern struct static_key_false sev_enable_key;
+static inline bool sev_key_active(void)
+{
+	return static_branch_unlikely(&sev_enable_key);
+}
+
+#else /* !CONFIG_AMD_MEM_ENCRYPT */
+
+static inline bool sev_key_active(void) { return false; }
+
+#endif /* CONFIG_AMD_MEM_ENCRYPT */
+
 #define BUILDIO(bwl, bw, type)						\
 static inline void out##bwl(unsigned type value, int port)		\
 {									\
@@ -296,14 +311,34 @@ static inline unsigned type in##bwl##_p(int port)			\
 									\
 static inline void outs##bwl(int port, const void *addr, unsigned long count) \
 {									\
-	asm volatile("rep; outs" #bwl					\
-		     : "+S"(addr), "+c"(count) : "d"(port) : "memory");	\
+	if (sev_key_active()) {						\
+		unsigned type *value = (unsigned type *)addr;		\
+		while (count) {						\
+			out##bwl(*value, port);				\
+			value++;					\
+			count--;					\
+		}							\
+	} else {							\
+		asm volatile("rep; outs" #bwl				\
+			     : "+S"(addr), "+c"(count)			\
+			     : "d"(port) : "memory");			\
+	}								\
 }									\
 									\
 static inline void ins##bwl(int port, void *addr, unsigned long count)	\
 {									\
-	asm volatile("rep; ins" #bwl					\
-		     : "+D"(addr), "+c"(count) : "d"(port) : "memory");	\
+	if (sev_key_active()) {						\
+		unsigned type *value = (unsigned type *)addr;		\
+		while (count) {						\
+			*value = in##bwl(port);				\
+			value++;					\
+			count--;					\
+		}							\
+	} else {							\
+		asm volatile("rep; ins" #bwl				\
+			     : "+D"(addr), "+c"(count)			\
+			     : "d"(port) : "memory");			\
+	}								\
 }
 
 BUILDIO(b, b, char)
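
The BUILDIO change above falls back to a plain loop of single-port accesses when SEV is active, since emulated string I/O would require the hypervisor to read guest memory that is encrypted under SEV. A minimal userspace sketch of the unrolled byte-write path; outb() here is a printf stub, not real port I/O:

#include <stdio.h>

/* Stub standing in for the real port write; no actual I/O here. */
static void outb(unsigned char value, int port)
{
	printf("outb(0x%02x, port 0x%x)\n", value, port);
}

/* What the sev_key_active() branch of outs##bwl expands to for bytes. */
static void outsb_unrolled(int port, const void *addr, unsigned long count)
{
	const unsigned char *value = addr;

	while (count) {
		outb(*value, port);
		value++;
		count--;
	}
}

int main(void)
{
	unsigned char buf[] = { 0xde, 0xad, 0xbe, 0xef };

	outsb_unrolled(0x3f8, buf, sizeof(buf));
	return 0;
}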
diff --git a/arch/x86/include/asm/mem_encrypt.h b/arch/x86/include/asm/mem_encrypt.h
index 6a77c63540f7..c9459a4c3c68 100644
--- a/arch/x86/include/asm/mem_encrypt.h
+++ b/arch/x86/include/asm/mem_encrypt.h
@@ -42,11 +42,17 @@ void __init sme_early_init(void);
 void __init sme_encrypt_kernel(void);
 void __init sme_enable(struct boot_params *bp);
 
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size);
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size);
+
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_init(void);
 
 void swiotlb_set_mem_attributes(void *vaddr, unsigned long size);
 
+bool sme_active(void);
+bool sev_active(void);
+
 #else	/* !CONFIG_AMD_MEM_ENCRYPT */
 
 #define sme_me_mask	0ULL
@@ -64,6 +70,14 @@ static inline void __init sme_early_init(void) { }
 static inline void __init sme_encrypt_kernel(void) { }
 static inline void __init sme_enable(struct boot_params *bp) { }
 
+static inline bool sme_active(void) { return false; }
+static inline bool sev_active(void) { return false; }
+
+static inline int __init
+early_set_memory_decrypted(unsigned long vaddr, unsigned long size) { return 0; }
+static inline int __init
+early_set_memory_encrypted(unsigned long vaddr, unsigned long size) { return 0; }
+
 #endif	/* CONFIG_AMD_MEM_ENCRYPT */
 
 /*
diff --git a/arch/x86/include/asm/module.h b/arch/x86/include/asm/module.h
index 8546fafa21a9..7948a17febb4 100644
--- a/arch/x86/include/asm/module.h
+++ b/arch/x86/include/asm/module.h
@@ -6,7 +6,7 @@
 #include <asm/orc_types.h>
 
 struct mod_arch_specific {
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 	unsigned int num_orcs;
 	int *orc_unwind_ip;
 	struct orc_entry *orc_unwind;
diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h
index ab022618a50a..34c4922bbc3f 100644
--- a/arch/x86/include/asm/msr-index.h
+++ b/arch/x86/include/asm/msr-index.h
@@ -324,6 +324,9 @@
 #define MSR_AMD64_IBSBRTARGET		0xc001103b
 #define MSR_AMD64_IBSOPDATA4		0xc001103d
 #define MSR_AMD64_IBS_REG_COUNT_MAX	8 /* includes MSR_AMD64_IBSBRTARGET */
+#define MSR_AMD64_SEV			0xc0010131
+#define MSR_AMD64_SEV_ENABLED_BIT	0
+#define MSR_AMD64_SEV_ENABLED		BIT_ULL(MSR_AMD64_SEV_ENABLED_BIT)
 
 /* Fam 17h MSRs */
 #define MSR_F17H_IRPERF			0xc00000e9
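
MSR 0xc0010131 reports whether SEV is active for the guest, with bit 0 as the enable bit. A small sketch of testing that bit in a value that would come from an MSR read; the read itself is omitted and the value hard-coded:

#include <stdio.h>
#include <stdint.h>

#define MSR_AMD64_SEV_ENABLED_BIT	0
#define MSR_AMD64_SEV_ENABLED		(1ULL << MSR_AMD64_SEV_ENABLED_BIT)

int main(void)
{
	uint64_t msr_val = 0x1;	/* pretend the MSR_AMD64_SEV read returned this */

	printf("SEV %s\n",
	       (msr_val & MSR_AMD64_SEV_ENABLED) ? "enabled" : "disabled");
	return 0;
}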
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index fd81228e8037..283efcaac8af 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -16,10 +16,9 @@
 #include <linux/cpumask.h>
 #include <asm/frame.h>
 
-static inline void load_sp0(struct tss_struct *tss,
-			     struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
 {
-	PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread);
+	PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0);
 }
 
 /* The paravirtualized CPUID instruction. */
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 10cc3b9709fe..6ec54d01972d 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -134,7 +134,7 @@ struct pv_cpu_ops {
 	void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries);
 	void (*free_ldt)(struct desc_struct *ldt, unsigned entries);
 
-	void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t);
+	void (*load_sp0)(unsigned long sp0);
 
 	void (*set_iopl_mask)(unsigned mask);
 
diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 377f1ffd18be..ba3c523aaf16 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -526,7 +526,7 @@ static inline bool x86_this_cpu_variable_test_bit(int nr,
 {
 	bool oldbit;
 
-	asm volatile("bt "__percpu_arg(2)",%1\n\t"
+	asm volatile("bt "__percpu_arg(2)",%1"
 			CC_SET(c)
 			: CC_OUT(c) (oldbit)
 			: "m" (*(unsigned long __percpu *)addr), "Ir" (nr));
diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
index 59df7b47a434..9e9b05fc4860 100644
--- a/arch/x86/include/asm/pgtable_types.h
+++ b/arch/x86/include/asm/pgtable_types.h
@@ -200,10 +200,9 @@ enum page_cache_mode {
 
 #define _PAGE_ENC	(_AT(pteval_t, sme_me_mask))
 
-#define _PAGE_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_USER |	\
-			 _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_ENC)
 #define _KERNPG_TABLE	(_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED |	\
 			 _PAGE_DIRTY | _PAGE_ENC)
+#define _PAGE_TABLE	(_KERNPG_TABLE | _PAGE_USER)
 
 #define __PAGE_KERNEL_ENC	(__PAGE_KERNEL | _PAGE_ENC)
 #define __PAGE_KERNEL_ENC_WP	(__PAGE_KERNEL_WP | _PAGE_ENC)
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index bdac19ab2488..2db7cf720b04 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -431,7 +431,9 @@ typedef struct {
 struct thread_struct {
 	/* Cached TLS descriptors: */
 	struct desc_struct	tls_array[GDT_ENTRY_TLS_ENTRIES];
+#ifdef CONFIG_X86_32
 	unsigned long		sp0;
+#endif
 	unsigned long		sp;
 #ifdef CONFIG_X86_32
 	unsigned long		sysenter_cs;
@@ -518,16 +520,9 @@ static inline void native_set_iopl_mask(unsigned mask)
 }
 
 static inline void
-native_load_sp0(struct tss_struct *tss, struct thread_struct *thread)
+native_load_sp0(unsigned long sp0)
 {
-	tss->x86_tss.sp0 = thread->sp0;
-#ifdef CONFIG_X86_32
-	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
-	if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) {
-		tss->x86_tss.ss1 = thread->sysenter_cs;
-		wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
-	}
-#endif
+	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
 }
 
 static inline void native_swapgs(void)
@@ -547,15 +542,20 @@ static inline unsigned long current_top_of_stack(void)
 #endif
 }
 
+static inline bool on_thread_stack(void)
+{
+	return (unsigned long)(current_top_of_stack() -
+			       current_stack_pointer) < THREAD_SIZE;
+}
+
 #ifdef CONFIG_PARAVIRT
 #include <asm/paravirt.h>
 #else
 #define __cpuid			native_cpuid
 
-static inline void load_sp0(struct tss_struct *tss,
-			    struct thread_struct *thread)
+static inline void load_sp0(unsigned long sp0)
 {
-	native_load_sp0(tss, thread);
+	native_load_sp0(sp0);
 }
 
 #define set_iopl_mask native_set_iopl_mask
@@ -804,6 +804,15 @@ static inline void spin_lock_prefetch(const void *x)
 #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
 			   TOP_OF_KERNEL_STACK_PADDING)
 
+#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1))
+
+#define task_pt_regs(task) \
+({									\
+	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
+	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
+	((struct pt_regs *)__ptr) - 1;					\
+})
+
 #ifdef CONFIG_X86_32
 /*
  * User space process size: 3GB (default).
@@ -823,23 +832,6 @@ static inline void spin_lock_prefetch(const void *x)
 	.addr_limit		= KERNEL_DS,				  \
 }
 
-/*
- * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
- * This is necessary to guarantee that the entire "struct pt_regs"
- * is accessible even if the CPU haven't stored the SS/ESP registers
- * on the stack (interrupt gate does not save these registers
- * when switching to the same priv ring).
- * Therefore beware: accessing the ss/esp fields of the
- * "struct pt_regs" is possible, but they may contain the
- * completely wrong values.
- */
-#define task_pt_regs(task) \
-({									\
-	unsigned long __ptr = (unsigned long)task_stack_page(task);	\
-	__ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;		\
-	((struct pt_regs *)__ptr) - 1;					\
-})
-
 #define KSTK_ESP(task)		(task_pt_regs(task)->sp)
 
 #else
@@ -873,11 +865,9 @@ static inline void spin_lock_prefetch(const void *x)
 #define STACK_TOP_MAX		TASK_SIZE_MAX
 
 #define INIT_THREAD  {						\
-	.sp0			= TOP_OF_INIT_STACK,		\
 	.addr_limit		= KERNEL_DS,			\
 }
 
-#define task_pt_regs(tsk)	((struct pt_regs *)(tsk)->thread.sp0 - 1)
 extern unsigned long KSTK_ESP(struct task_struct *task);
 
 #endif /* CONFIG_X86_64 */
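
With thread.sp0 gone on 64-bit, task_pt_regs() is now derived purely from the task's stack page: the register frame sits at the top of the stack, just below TOP_OF_KERNEL_STACK_PADDING. A rough userspace sketch of that arithmetic, using stand-in types and sizes rather than the kernel's real pt_regs and THREAD_SIZE:

#include <stdio.h>
#include <stdlib.h>

#define THREAD_SIZE			(16 * 1024)	/* stand-in stack size */
#define TOP_OF_KERNEL_STACK_PADDING	0		/* 0 on 64-bit in this sketch */

struct fake_pt_regs {
	unsigned long ip, sp, flags;	/* stand-in for struct pt_regs */
};

static struct fake_pt_regs *fake_task_pt_regs(void *stack_page)
{
	unsigned long ptr = (unsigned long)stack_page;

	ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING;
	return (struct fake_pt_regs *)ptr - 1;
}

int main(void)
{
	void *stack = malloc(THREAD_SIZE);
	struct fake_pt_regs *regs = fake_task_pt_regs(stack);

	printf("stack page at %p, register frame at %p\n", stack, (void *)regs);
	free(stack);
	return 0;
}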
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index c0e3c45cf6ab..14131dd06b29 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -136,9 +136,9 @@ static inline int v8086_mode(struct pt_regs *regs)
 #endif
 }
 
-#ifdef CONFIG_X86_64
 static inline bool user_64bit_mode(struct pt_regs *regs)
 {
+#ifdef CONFIG_X86_64
 #ifndef CONFIG_PARAVIRT
 	/*
 	 * On non-paravirt systems, this is the only long mode CPL 3
@@ -149,8 +149,12 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
 	/* Headers are too twisted for this to go in paravirt.h. */
 	return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs;
 #endif
+#else /* !CONFIG_X86_64 */
+	return false;
+#endif
 }
 
+#ifdef CONFIG_X86_64
 #define current_user_stack_pointer()	current_pt_regs()->sp
 #define compat_user_stack_pointer()	current_pt_regs()->sp
 #endif
diff --git a/arch/x86/include/asm/rmwcc.h b/arch/x86/include/asm/rmwcc.h
index d8f3a6ae9f6c..f91c365e57c3 100644
--- a/arch/x86/include/asm/rmwcc.h
+++ b/arch/x86/include/asm/rmwcc.h
@@ -29,7 +29,7 @@ cc_label:								\
 #define __GEN_RMWcc(fullop, var, cc, clobbers, ...)			\
 do {									\
 	bool c;								\
-	asm volatile (fullop ";" CC_SET(cc)				\
+	asm volatile (fullop CC_SET(cc)					\
 			: [counter] "+m" (var), CC_OUT(cc) (c)		\
 			: __VA_ARGS__ : clobbers);			\
 	return c;							\
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 899084b70412..8c6bd6863db9 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -2,6 +2,8 @@
 #ifndef _ASM_X86_SWITCH_TO_H
 #define _ASM_X86_SWITCH_TO_H
 
+#include <linux/sched/task_stack.h>
+
 struct task_struct; /* one of the stranger aspects of C forward declarations */
 
 struct task_struct *__switch_to_asm(struct task_struct *prev,
@@ -73,4 +75,26 @@ do {									\
 	((last) = __switch_to_asm((prev), (next)));			\
 } while (0)
 
+#ifdef CONFIG_X86_32
+static inline void refresh_sysenter_cs(struct thread_struct *thread)
+{
+	/* Only happens when SEP is enabled, no need to test "SEP"arately: */
+	if (unlikely(this_cpu_read(cpu_tss.x86_tss.ss1) == thread->sysenter_cs))
+		return;
+
+	this_cpu_write(cpu_tss.x86_tss.ss1, thread->sysenter_cs);
+	wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0);
+}
+#endif
+
+/* This is used when switching tasks or entering/exiting vm86 mode. */
+static inline void update_sp0(struct task_struct *task)
+{
+#ifdef CONFIG_X86_32
+	load_sp0(task->thread.sp0);
+#else
+	load_sp0(task_top_of_stack(task));
+#endif
+}
+
 #endif /* _ASM_X86_SWITCH_TO_H */
diff --git a/arch/x86/include/asm/syscalls.h b/arch/x86/include/asm/syscalls.h
index 91dfcafe27a6..bad25bb80679 100644
--- a/arch/x86/include/asm/syscalls.h
+++ b/arch/x86/include/asm/syscalls.h
@@ -21,7 +21,7 @@ asmlinkage long sys_ioperm(unsigned long, unsigned long, int);
 asmlinkage long sys_iopl(unsigned int);
 
 /* kernel/ldt.c */
-asmlinkage int sys_modify_ldt(int, void __user *, unsigned long);
+asmlinkage long sys_modify_ldt(int, void __user *, unsigned long);
 
 /* kernel/signal.c */
 asmlinkage long sys_rt_sigreturn(void);
diff --git a/arch/x86/include/asm/trace/fpu.h b/arch/x86/include/asm/trace/fpu.h
index fa60398bbc3a..069c04be1507 100644
--- a/arch/x86/include/asm/trace/fpu.h
+++ b/arch/x86/include/asm/trace/fpu.h
@@ -34,11 +34,6 @@ DECLARE_EVENT_CLASS(x86_fpu,
 	)
 );
 
-DEFINE_EVENT(x86_fpu, x86_fpu_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_before_save,
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
@@ -74,11 +69,6 @@ DEFINE_EVENT(x86_fpu, x86_fpu_activate_state,
 	TP_ARGS(fpu)
 );
 
-DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state,
-	TP_PROTO(struct fpu *fpu),
-	TP_ARGS(fpu)
-);
-
 DEFINE_EVENT(x86_fpu, x86_fpu_init_state,
 	TP_PROTO(struct fpu *fpu),
 	TP_ARGS(fpu)
diff --git a/arch/x86/include/asm/traps.h b/arch/x86/include/asm/traps.h
index b0cced97a6ce..1fadd310ff68 100644
--- a/arch/x86/include/asm/traps.h
+++ b/arch/x86/include/asm/traps.h
@@ -38,9 +38,9 @@ asmlinkage void simd_coprocessor_error(void);
 
 #if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV)
 asmlinkage void xen_divide_error(void);
+asmlinkage void xen_xennmi(void);
 asmlinkage void xen_xendebug(void);
 asmlinkage void xen_xenint3(void);
-asmlinkage void xen_nmi(void);
 asmlinkage void xen_overflow(void);
 asmlinkage void xen_bounds(void);
 asmlinkage void xen_invalid_op(void);
@@ -145,4 +145,22 @@ enum {
 	X86_TRAP_IRET = 32,	/* 32, IRET Exception */
 };
 
+/*
+ * Page fault error code bits:
+ *
+ *   bit 0 ==	 0: no page found	1: protection fault
+ *   bit 1 ==	 0: read access		1: write access
+ *   bit 2 ==	 0: kernel-mode access	1: user-mode access
+ *   bit 3 ==				1: use of reserved bit detected
+ *   bit 4 ==				1: fault was an instruction fetch
+ *   bit 5 ==				1: protection keys block access
+ */
+enum x86_pf_error_code {
+	X86_PF_PROT	=		1 << 0,
+	X86_PF_WRITE	=		1 << 1,
+	X86_PF_USER	=		1 << 2,
+	X86_PF_RSVD	=		1 << 3,
+	X86_PF_INSTR	=		1 << 4,
+	X86_PF_PK	=		1 << 5,
+};
 #endif /* _ASM_X86_TRAPS_H */
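
The new X86_PF_* enum gives the page-fault error-code bits symbolic names. A standalone sketch that decodes an error code with those bits; the bit values are copied from the hunk above, the printing is purely illustrative:

#include <stdio.h>

enum x86_pf_error_code {
	X86_PF_PROT	= 1 << 0,
	X86_PF_WRITE	= 1 << 1,
	X86_PF_USER	= 1 << 2,
	X86_PF_RSVD	= 1 << 3,
	X86_PF_INSTR	= 1 << 4,
	X86_PF_PK	= 1 << 5,
};

static void decode_pf(unsigned long error_code)
{
	printf("%s access, %s, %s mode%s%s%s\n",
	       (error_code & X86_PF_WRITE) ? "write" : "read",
	       (error_code & X86_PF_PROT)  ? "protection fault" : "page not present",
	       (error_code & X86_PF_USER)  ? "user" : "kernel",
	       (error_code & X86_PF_INSTR) ? ", instruction fetch" : "",
	       (error_code & X86_PF_RSVD)  ? ", reserved bit set" : "",
	       (error_code & X86_PF_PK)    ? ", blocked by a protection key" : "");
}

int main(void)
{
	decode_pf(X86_PF_USER | X86_PF_WRITE);	/* user write to a non-present page */
	decode_pf(X86_PF_PROT | X86_PF_INSTR);	/* kernel instruction fetch, protection fault */
	return 0;
}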
diff --git a/arch/x86/include/asm/umip.h b/arch/x86/include/asm/umip.h
new file mode 100644
index 000000000000..db43f2a0d92c
--- /dev/null
+++ b/arch/x86/include/asm/umip.h
@@ -0,0 +1,12 @@
+#ifndef _ASM_X86_UMIP_H
+#define _ASM_X86_UMIP_H
+
+#include <linux/types.h>
+#include <asm/ptrace.h>
+
+#ifdef CONFIG_X86_INTEL_UMIP
+bool fixup_umip_exception(struct pt_regs *regs);
+#else
+static inline bool fixup_umip_exception(struct pt_regs *regs) { return false; }
+#endif  /* CONFIG_X86_INTEL_UMIP */
+#endif  /* _ASM_X86_UMIP_H */
diff --git a/arch/x86/include/asm/unwind.h b/arch/x86/include/asm/unwind.h
index 87adc0d38c4a..e9cc6fe1fc6f 100644
--- a/arch/x86/include/asm/unwind.h
+++ b/arch/x86/include/asm/unwind.h
@@ -13,11 +13,11 @@ struct unwind_state {
 	struct task_struct *task;
 	int graph_idx;
 	bool error;
-#if defined(CONFIG_ORC_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC)
 	bool signal, full_regs;
 	unsigned long sp, bp, ip;
 	struct pt_regs *regs;
-#elif defined(CONFIG_FRAME_POINTER_UNWINDER)
+#elif defined(CONFIG_UNWINDER_FRAME_POINTER)
 	bool got_irq;
 	unsigned long *bp, *orig_sp, ip;
 	struct pt_regs *regs;
@@ -51,7 +51,7 @@ void unwind_start(struct unwind_state *state, struct task_struct *task,
 	__unwind_start(state, task, regs, first_frame);
 }
 
-#if defined(CONFIG_ORC_UNWINDER) || defined(CONFIG_FRAME_POINTER_UNWINDER)
+#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER)
 static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 {
 	if (unwind_done(state))
@@ -66,7 +66,7 @@ static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state)
 }
 #endif
 
-#ifdef CONFIG_ORC_UNWINDER
+#ifdef CONFIG_UNWINDER_ORC
 void unwind_init(void);
 void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size,
 			void *orc, size_t orc_size);
diff --git a/arch/x86/include/uapi/asm/kvm_para.h b/arch/x86/include/uapi/asm/kvm_para.h
index 554aa8f24f91..09cc06483bed 100644
--- a/arch/x86/include/uapi/asm/kvm_para.h
+++ b/arch/x86/include/uapi/asm/kvm_para.h
@@ -110,5 +110,4 @@ struct kvm_vcpu_pv_apf_data {
 #define KVM_PV_EOI_ENABLED KVM_PV_EOI_MASK
 #define KVM_PV_EOI_DISABLED 0x0
 
-
 #endif /* _UAPI_ASM_X86_KVM_PARA_H */
diff --git a/arch/x86/include/uapi/asm/processor-flags.h b/arch/x86/include/uapi/asm/processor-flags.h
index 6f3355399665..7e1e730396ae 100644
--- a/arch/x86/include/uapi/asm/processor-flags.h
+++ b/arch/x86/include/uapi/asm/processor-flags.h
@@ -105,6 +105,8 @@
 #define X86_CR4_OSFXSR		_BITUL(X86_CR4_OSFXSR_BIT)
 #define X86_CR4_OSXMMEXCPT_BIT	10 /* enable unmasked SSE exceptions */
 #define X86_CR4_OSXMMEXCPT	_BITUL(X86_CR4_OSXMMEXCPT_BIT)
+#define X86_CR4_UMIP_BIT	11 /* enable UMIP support */
+#define X86_CR4_UMIP		_BITUL(X86_CR4_UMIP_BIT)
 #define X86_CR4_LA57_BIT	12 /* enable 5-level page tables */
 #define X86_CR4_LA57		_BITUL(X86_CR4_LA57_BIT)
 #define X86_CR4_VMXE_BIT	13 /* enable VMX virtualization */
@@ -152,5 +154,8 @@
 #define CX86_ARR_BASE	0xc4
 #define CX86_RCR_BASE	0xdc
 
+#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
+			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
+			 X86_CR0_PG)
 
 #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index 5f70044340ff..81bb565f4497 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -25,9 +25,9 @@ endif
 KASAN_SANITIZE_head$(BITS).o				:= n
 KASAN_SANITIZE_dumpstack.o				:= n
 KASAN_SANITIZE_dumpstack_$(BITS).o			:= n
-KASAN_SANITIZE_stacktrace.o := n
+KASAN_SANITIZE_stacktrace.o				:= n
+KASAN_SANITIZE_paravirt.o				:= n
 
-OBJECT_FILES_NON_STANDARD_head_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o	:= y
 OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o		:= y
 OBJECT_FILES_NON_STANDARD_test_nx.o			:= y
@@ -127,10 +127,11 @@ obj-$(CONFIG_EFI)			+= sysfb_efi.o
 obj-$(CONFIG_PERF_EVENTS)		+= perf_regs.o
 obj-$(CONFIG_TRACING)			+= tracepoint.o
 obj-$(CONFIG_SCHED_MC_PRIO)		+= itmt.o
+obj-$(CONFIG_X86_INTEL_UMIP)		+= umip.o
 
-obj-$(CONFIG_ORC_UNWINDER)		+= unwind_orc.o
-obj-$(CONFIG_FRAME_POINTER_UNWINDER)	+= unwind_frame.o
-obj-$(CONFIG_GUESS_UNWINDER)		+= unwind_guess.o
+obj-$(CONFIG_UNWINDER_ORC)		+= unwind_orc.o
+obj-$(CONFIG_UNWINDER_FRAME_POINTER)	+= unwind_frame.o
+obj-$(CONFIG_UNWINDER_GUESS)		+= unwind_guess.o
 
 ###
 # 64 bit specific files
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 3344d3382e91..dbaf14d69ebd 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -442,7 +442,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
 {
 	const s32 *poff;
 
-	mutex_lock(&text_mutex);
 	for (poff = start; poff < end; poff++) {
 		u8 *ptr = (u8 *)poff + *poff;
 
@@ -452,7 +451,6 @@ static void alternatives_smp_lock(const s32 *start, const s32 *end,
 		if (*ptr == 0x3e)
 			text_poke(ptr, ((unsigned char []){0xf0}), 1);
 	}
-	mutex_unlock(&text_mutex);
 }
 
 static void alternatives_smp_unlock(const s32 *start, const s32 *end,
@@ -460,7 +458,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
 {
 	const s32 *poff;
 
-	mutex_lock(&text_mutex);
 	for (poff = start; poff < end; poff++) {
 		u8 *ptr = (u8 *)poff + *poff;
 
@@ -470,7 +467,6 @@ static void alternatives_smp_unlock(const s32 *start, const s32 *end,
 		if (*ptr == 0xf0)
 			text_poke(ptr, ((unsigned char []){0x3E}), 1);
 	}
-	mutex_unlock(&text_mutex);
 }
 
 struct smp_alt_module {
@@ -489,8 +485,7 @@ struct smp_alt_module {
 	struct list_head next;
 };
 static LIST_HEAD(smp_alt_modules);
-static DEFINE_MUTEX(smp_alt);
-static bool uniproc_patched = false;	/* protected by smp_alt */
+static bool uniproc_patched = false;	/* protected by text_mutex */
 
 void __init_or_module alternatives_smp_module_add(struct module *mod,
 						  char *name,
@@ -499,7 +494,7 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 {
 	struct smp_alt_module *smp;
 
-	mutex_lock(&smp_alt);
+	mutex_lock(&text_mutex);
 	if (!uniproc_patched)
 		goto unlock;
 
@@ -526,14 +521,14 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
 smp_unlock:
 	alternatives_smp_unlock(locks, locks_end, text, text_end);
 unlock:
-	mutex_unlock(&smp_alt);
+	mutex_unlock(&text_mutex);
 }
 
 void __init_or_module alternatives_smp_module_del(struct module *mod)
 {
 	struct smp_alt_module *item;
 
-	mutex_lock(&smp_alt);
+	mutex_lock(&text_mutex);
 	list_for_each_entry(item, &smp_alt_modules, next) {
 		if (mod != item->mod)
 			continue;
@@ -541,7 +536,7 @@ void __init_or_module alternatives_smp_module_del(struct module *mod)
 		kfree(item);
 		break;
 	}
-	mutex_unlock(&smp_alt);
+	mutex_unlock(&text_mutex);
 }
 
 void alternatives_enable_smp(void)
@@ -551,7 +546,7 @@ void alternatives_enable_smp(void)
 	/* Why bother if there are no other CPUs? */
 	BUG_ON(num_possible_cpus() == 1);
 
-	mutex_lock(&smp_alt);
+	mutex_lock(&text_mutex);
 
 	if (uniproc_patched) {
 		pr_info("switching to SMP code\n");
@@ -563,10 +558,13 @@ void alternatives_enable_smp(void)
 					      mod->text, mod->text_end);
 		uniproc_patched = false;
 	}
-	mutex_unlock(&smp_alt);
+	mutex_unlock(&text_mutex);
 }
 
-/* Return 1 if the address range is reserved for smp-alternatives */
+/*
+ * Return 1 if the address range is reserved for SMP-alternatives.
+ * Must hold text_mutex.
+ */
 int alternatives_text_reserved(void *start, void *end)
 {
 	struct smp_alt_module *mod;
@@ -574,6 +572,8 @@ int alternatives_text_reserved(void *start, void *end)
 	u8 *text_start = start;
 	u8 *text_end = end;
 
+	lockdep_assert_held(&text_mutex);
+
 	list_for_each_entry(mod, &smp_alt_modules, next) {
 		if (mod->text > text_end || mod->text_end < text_start)
 			continue;
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index c60922a66385..90cb82dbba57 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -23,6 +23,7 @@ obj-y			+= rdrand.o
 obj-y			+= match.o
 obj-y			+= bugs.o
 obj-$(CONFIG_CPU_FREQ)	+= aperfmperf.o
+obj-y			+= cpuid-deps.o
 
 obj-$(CONFIG_PROC_FS)	+= proc.o
 obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index c9176bae7fd8..47f8a85be11c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -329,6 +329,28 @@ static __always_inline void setup_smap(struct cpuinfo_x86 *c)
 	}
 }
 
+static __always_inline void setup_umip(struct cpuinfo_x86 *c)
+{
+	/* Check the boot processor, plus build option for UMIP. */
+	if (!cpu_feature_enabled(X86_FEATURE_UMIP))
+		goto out;
+
+	/* Check the current processor's cpuid bits. */
+	if (!cpu_has(c, X86_FEATURE_UMIP))
+		goto out;
+
+	cr4_set_bits(X86_CR4_UMIP);
+
+	return;
+
+out:
+	/*
+	 * Make sure UMIP is disabled in case it was enabled in a
+	 * previous boot (e.g., via kexec).
+	 */
+	cr4_clear_bits(X86_CR4_UMIP);
+}
+
 /*
  * Protection Keys are not available in 32-bit mode.
  */
@@ -1147,9 +1169,10 @@ static void identify_cpu(struct cpuinfo_x86 *c)
 	/* Disable the PN if appropriate */
 	squash_the_stupid_serial_number(c);
 
-	/* Set up SMEP/SMAP */
+	/* Set up SMEP/SMAP/UMIP */
 	setup_smep(c);
 	setup_smap(c);
+	setup_umip(c);
 
 	/*
 	 * The vendor-specific functions might have changed features.
@@ -1301,18 +1324,16 @@ void print_cpu_info(struct cpuinfo_x86 *c)
 		pr_cont(")\n");
 }
 
-static __init int setup_disablecpuid(char *arg)
+/*
+ * clearcpuid= was already parsed in fpu__init_parse_early_param.
+ * But we need to keep a dummy __setup around otherwise it would
+ * show up as an environment variable for init.
+ */
+static __init int setup_clearcpuid(char *arg)
 {
-	int bit;
-
-	if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32)
-		setup_clear_cpu_cap(bit);
-	else
-		return 0;
-
 	return 1;
 }
-__setup("clearcpuid=", setup_disablecpuid);
+__setup("clearcpuid=", setup_clearcpuid);
 
 #ifdef CONFIG_X86_64
 DEFINE_PER_CPU_FIRST(union irq_stack_union,
@@ -1572,9 +1593,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, me);
 
-	load_sp0(t, &current->thread);
+	/*
+	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
+	 * task never enters user mode.
+	 */
 	set_tss_desc(cpu, t);
 	load_TR_desc();
+
 	load_mm_ldt(&init_mm);
 
 	clear_all_debug_regs();
@@ -1596,7 +1621,6 @@ void cpu_init(void)
 	int cpu = smp_processor_id();
 	struct task_struct *curr = current;
 	struct tss_struct *t = &per_cpu(cpu_tss, cpu);
-	struct thread_struct *thread = &curr->thread;
 
 	wait_for_master_cpu(cpu);
 
@@ -1627,9 +1651,13 @@ void cpu_init(void)
 	initialize_tlbstate_and_flush();
 	enter_lazy_tlb(&init_mm, curr);
 
-	load_sp0(t, thread);
+	/*
+	 * Initialize the TSS.  Don't bother initializing sp0, as the initial
+	 * task never enters user mode.
+	 */
 	set_tss_desc(cpu, t);
 	load_TR_desc();
+
 	load_mm_ldt(&init_mm);
 
 	t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap);
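
setup_umip() above enables CR4.UMIP only when both the build-time option and the CPU's CPUID bit agree, and otherwise clears it so a value inherited from a previous kernel (e.g. after kexec) does not linger. A toy model of that policy, with CR4 reduced to a plain variable:

#include <stdio.h>
#include <stdbool.h>

#define X86_CR4_UMIP	(1UL << 11)

static unsigned long fake_cr4;		/* stand-in for the real CR4 register */

static void setup_umip(bool built_with_umip, bool cpu_has_umip)
{
	if (built_with_umip && cpu_has_umip) {
		fake_cr4 |= X86_CR4_UMIP;
		return;
	}
	/* Make sure UMIP is off if a previous boot left it enabled. */
	fake_cr4 &= ~X86_CR4_UMIP;
}

int main(void)
{
	fake_cr4 = X86_CR4_UMIP;	/* pretend kexec inherited CR4.UMIP=1 */
	setup_umip(true, false);	/* built with UMIP, but this CPU lacks it */

	printf("CR4.UMIP after setup: %d\n", !!(fake_cr4 & X86_CR4_UMIP));
	return 0;
}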
diff --git a/arch/x86/kernel/cpu/cpuid-deps.c b/arch/x86/kernel/cpu/cpuid-deps.c
new file mode 100644
index 000000000000..904b0a3c4e53
--- /dev/null
+++ b/arch/x86/kernel/cpu/cpuid-deps.c
@@ -0,0 +1,121 @@
+/* Declare dependencies between CPUIDs */
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <asm/cpufeature.h>
+
+struct cpuid_dep {
+	unsigned int	feature;
+	unsigned int	depends;
+};
+
+/*
+ * Table of CPUID features that depend on others.
+ *
+ * This only includes dependencies that can be usefully disabled, not
+ * features part of the base set (like FPU).
+ *
+ * Note this all is not __init / __initdata because it can be
+ * called from cpu hotplug. It shouldn't do anything in this case,
+ * but it's difficult to tell that to the init reference checker.
+ */
+const static struct cpuid_dep cpuid_deps[] = {
+	{ X86_FEATURE_XSAVEOPT,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XSAVEC,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XSAVES,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_AVX,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_PKU,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_MPX,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_XGETBV1,		X86_FEATURE_XSAVE     },
+	{ X86_FEATURE_FXSR_OPT,		X86_FEATURE_FXSR      },
+	{ X86_FEATURE_XMM,		X86_FEATURE_FXSR      },
+	{ X86_FEATURE_XMM2,		X86_FEATURE_XMM       },
+	{ X86_FEATURE_XMM3,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM4_1,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM4_2,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_XMM3,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_PCLMULQDQ,	X86_FEATURE_XMM2      },
+	{ X86_FEATURE_SSSE3,		X86_FEATURE_XMM2,     },
+	{ X86_FEATURE_F16C,		X86_FEATURE_XMM2,     },
+	{ X86_FEATURE_AES,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_SHA_NI,		X86_FEATURE_XMM2      },
+	{ X86_FEATURE_FMA,		X86_FEATURE_AVX       },
+	{ X86_FEATURE_AVX2,		X86_FEATURE_AVX,      },
+	{ X86_FEATURE_AVX512F,		X86_FEATURE_AVX,      },
+	{ X86_FEATURE_AVX512IFMA,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512PF,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512ER,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512CD,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512DQ,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512BW,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512VL,		X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512VBMI,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_VBMI2,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_GFNI,		X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_VAES,		X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_VPCLMULQDQ,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_VNNI,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_BITALG,	X86_FEATURE_AVX512VL  },
+	{ X86_FEATURE_AVX512_4VNNIW,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_4FMAPS,	X86_FEATURE_AVX512F   },
+	{ X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F   },
+	{}
+};
+
+static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	/*
+	 * Note: This could use the non atomic __*_bit() variants, but the
+	 * rest of the cpufeature code uses atomics as well, so keep it for
+	 * consistency. Cleanup all of it separately.
+	 */
+	if (!c) {
+		clear_cpu_cap(&boot_cpu_data, feature);
+		set_bit(feature, (unsigned long *)cpu_caps_cleared);
+	} else {
+		clear_bit(feature, (unsigned long *)c->x86_capability);
+	}
+}
+
+/* Take the capabilities and the BUG bits into account */
+#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8)
+
+static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	DECLARE_BITMAP(disable, MAX_FEATURE_BITS);
+	const struct cpuid_dep *d;
+	bool changed;
+
+	if (WARN_ON(feature >= MAX_FEATURE_BITS))
+		return;
+
+	clear_feature(c, feature);
+
+	/* Collect all features to disable, handling dependencies */
+	memset(disable, 0, sizeof(disable));
+	__set_bit(feature, disable);
+
+	/* Loop until we get a stable state. */
+	do {
+		changed = false;
+		for (d = cpuid_deps; d->feature; d++) {
+			if (!test_bit(d->depends, disable))
+				continue;
+			if (__test_and_set_bit(d->feature, disable))
+				continue;
+
+			changed = true;
+			clear_feature(c, d->feature);
+		}
+	} while (changed);
+}
+
+void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature)
+{
+	do_clear_cpu_cap(c, feature);
+}
+
+void setup_clear_cpu_cap(unsigned int feature)
+{
+	do_clear_cpu_cap(NULL, feature);
+}
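
do_clear_cpu_cap() above computes the transitive closure of cpuid_deps[]: clearing one feature repeatedly sweeps the table until no dependent feature remains set. A reduced standalone sketch of the same loop over a four-entry toy table (the table and names are made up for illustration):

#include <stdio.h>
#include <stdbool.h>
#include <string.h>

enum { FEAT_XSAVE, FEAT_AVX, FEAT_AVX2, FEAT_AVX512F, FEAT_MAX };

static const char *names[FEAT_MAX] = { "xsave", "avx", "avx2", "avx512f" };

static const struct { int feature, depends; } deps[] = {
	{ FEAT_AVX,	FEAT_XSAVE },
	{ FEAT_AVX2,	FEAT_AVX   },
	{ FEAT_AVX512F,	FEAT_AVX   },
	{ -1, -1 }
};

static void clear_cap(bool caps[FEAT_MAX], int feature)
{
	bool changed;

	caps[feature] = false;

	/* Sweep the table until no further dependent feature needs clearing. */
	do {
		changed = false;
		for (int i = 0; deps[i].feature >= 0; i++) {
			if (caps[deps[i].depends] || !caps[deps[i].feature])
				continue;
			caps[deps[i].feature] = false;
			changed = true;
		}
	} while (changed);
}

int main(void)
{
	bool caps[FEAT_MAX];

	memset(caps, true, sizeof(caps));	/* start with everything enabled */
	clear_cap(caps, FEAT_XSAVE);		/* disable XSAVE and its dependents */

	for (int i = 0; i < FEAT_MAX; i++)
		printf("%-8s %s\n", names[i], caps[i] ? "on" : "off");
	return 0;
}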
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 44404e2307bb..10e74d4778a1 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -209,7 +209,7 @@ void native_machine_crash_shutdown(struct pt_regs *regs)
 }
 
 #ifdef CONFIG_KEXEC_FILE
-static int get_nr_ram_ranges_callback(u64 start, u64 end, void *arg)
+static int get_nr_ram_ranges_callback(struct resource *res, void *arg)
 {
 	unsigned int *nr_ranges = arg;
 
@@ -342,7 +342,7 @@ static int elf_header_exclude_ranges(struct crash_elf_data *ced,
 	return ret;
 }
 
-static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
+static int prepare_elf64_ram_headers_callback(struct resource *res, void *arg)
 {
 	struct crash_elf_data *ced = arg;
 	Elf64_Ehdr *ehdr;
@@ -355,7 +355,7 @@ static int prepare_elf64_ram_headers_callback(u64 start, u64 end, void *arg)
 	ehdr = ced->ehdr;
 
 	/* Exclude unwanted mem ranges */
-	ret = elf_header_exclude_ranges(ced, start, end);
+	ret = elf_header_exclude_ranges(ced, res->start, res->end);
 	if (ret)
 		return ret;
 
@@ -518,14 +518,14 @@ static int add_e820_entry(struct boot_params *params, struct e820_entry *entry)
 	return 0;
 }
 
-static int memmap_entry_callback(u64 start, u64 end, void *arg)
+static int memmap_entry_callback(struct resource *res, void *arg)
 {
 	struct crash_memmap_data *cmd = arg;
 	struct boot_params *params = cmd->params;
 	struct e820_entry ei;
 
-	ei.addr = start;
-	ei.size = end - start + 1;
+	ei.addr = res->start;
+	ei.size = resource_size(res);
 	ei.type = cmd->type;
 	add_e820_entry(params, &ei);
 
@@ -619,12 +619,12 @@ out:
 	return ret;
 }
 
-static int determine_backup_region(u64 start, u64 end, void *arg)
+static int determine_backup_region(struct resource *res, void *arg)
 {
 	struct kimage *image = arg;
 
-	image->arch.backup_src_start = start;
-	image->arch.backup_src_sz = end - start + 1;
+	image->arch.backup_src_start = res->start;
+	image->arch.backup_src_sz = resource_size(res);
 
 	/* Expecting only one range for backup region */
 	return 1;
diff --git a/arch/x86/kernel/fpu/init.c b/arch/x86/kernel/fpu/init.c
index 7affb7e3d9a5..6abd83572b01 100644
--- a/arch/x86/kernel/fpu/init.c
+++ b/arch/x86/kernel/fpu/init.c
@@ -249,6 +249,10 @@ static void __init fpu__init_system_ctx_switch(void)
  */
 static void __init fpu__init_parse_early_param(void)
 {
+	char arg[32];
+	char *argptr = arg;
+	int bit;
+
 	if (cmdline_find_option_bool(boot_command_line, "no387"))
 		setup_clear_cpu_cap(X86_FEATURE_FPU);
 
@@ -266,6 +270,13 @@ static void __init fpu__init_parse_early_param(void)
 
 	if (cmdline_find_option_bool(boot_command_line, "noxsaves"))
 		setup_clear_cpu_cap(X86_FEATURE_XSAVES);
+
+	if (cmdline_find_option(boot_command_line, "clearcpuid", arg,
+				sizeof(arg)) &&
+	    get_option(&argptr, &bit) &&
+	    bit >= 0 &&
+	    bit < NCAPINTS * 32)
+		setup_clear_cpu_cap(bit);
 }
 
 /*
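
A rough user-space sketch of the bounds check done when parsing clearcpuid=<bit> above; strtol stands in for get_option(), and the capacity constant is a placeholder, not the kernel's NCAPINTS.

#include <stdio.h>
#include <stdlib.h>

#define MAX_CAP_BITS	(19 * 32)	/* placeholder for NCAPINTS * 32 */

int main(int argc, char *argv[])
{
	char *end;
	long bit;

	if (argc < 2)
		return 1;

	bit = strtol(argv[1], &end, 0);
	if (end == argv[1] || bit < 0 || bit >= MAX_CAP_BITS) {
		fprintf(stderr, "ignoring invalid clearcpuid bit '%s'\n", argv[1]);
		return 1;
	}

	printf("would clear CPU capability bit %ld\n", bit);
	return 0;
}
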
diff --git a/arch/x86/kernel/fpu/xstate.c b/arch/x86/kernel/fpu/xstate.c
index f1d5476c9022..87a57b7642d3 100644
--- a/arch/x86/kernel/fpu/xstate.c
+++ b/arch/x86/kernel/fpu/xstate.c
@@ -15,6 +15,7 @@
 #include <asm/fpu/xstate.h>
 
 #include <asm/tlbflush.h>
+#include <asm/cpufeature.h>
 
 /*
  * Although we spell it out in here, the Processor Trace
@@ -36,6 +37,19 @@ static const char *xfeature_names[] =
 	"unknown xstate feature"	,
 };
 
+static short xsave_cpuid_features[] __initdata = {
+	X86_FEATURE_FPU,
+	X86_FEATURE_XMM,
+	X86_FEATURE_AVX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_MPX,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_AVX512F,
+	X86_FEATURE_INTEL_PT,
+	X86_FEATURE_PKU,
+};
+
 /*
  * Mask of xstate features supported by the CPU and the kernel:
  */
@@ -59,26 +73,6 @@ unsigned int fpu_user_xstate_size;
 void fpu__xstate_clear_all_cpu_caps(void)
 {
 	setup_clear_cpu_cap(X86_FEATURE_XSAVE);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVEC);
-	setup_clear_cpu_cap(X86_FEATURE_XSAVES);
-	setup_clear_cpu_cap(X86_FEATURE_AVX);
-	setup_clear_cpu_cap(X86_FEATURE_AVX2);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512F);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512PF);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512ER);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512CD);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512DQ);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512BW);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512VL);
-	setup_clear_cpu_cap(X86_FEATURE_MPX);
-	setup_clear_cpu_cap(X86_FEATURE_XGETBV1);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI);
-	setup_clear_cpu_cap(X86_FEATURE_PKU);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS);
-	setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ);
 }
 
 /*
@@ -726,6 +720,7 @@ void __init fpu__init_system_xstate(void)
 	unsigned int eax, ebx, ecx, edx;
 	static int on_boot_cpu __initdata = 1;
 	int err;
+	int i;
 
 	WARN_ON_FPU(!on_boot_cpu);
 	on_boot_cpu = 0;
@@ -759,6 +754,14 @@ void __init fpu__init_system_xstate(void)
 		goto out_disable;
 	}
 
+	/*
+	 * Clear XSAVE features that are disabled in the normal CPUID.
+	 */
+	for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) {
+		if (!boot_cpu_has(xsave_cpuid_features[i]))
+			xfeatures_mask &= ~BIT(i);
+	}
+
 	xfeatures_mask &= fpu__get_supported_xfeatures_mask();
 
 	/* Enable xstate instructions to be able to continue with initialization: */
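
A stand-alone sketch of the masking step added above: each xfeature bit is mapped to the CPUID feature it needs, and bits whose feature is absent are dropped from the mask. The mapping table and the cpu_has() stub below are illustrative assumptions, not kernel code.

#include <stdint.h>
#include <stdio.h>

/* Illustrative mapping: xfeature bit number -> required feature ID. */
static const int required_feature[] = { 0, 1, 2, 3, 3, 4, 4, 4, 5, 6 };

/* Stand-in for boot_cpu_has(); pretend feature 3 (say, MPX) is absent. */
static int cpu_has(int feature)
{
	return feature != 3;
}

int main(void)
{
	uint64_t xfeatures_mask = 0x2ff;	/* example mask from CPUID.0xD */
	unsigned int i;

	for (i = 0; i < sizeof(required_feature) / sizeof(required_feature[0]); i++) {
		if (!cpu_has(required_feature[i]))
			xfeatures_mask &= ~(1ULL << i);
	}

	/* Bits 3 and 4 (the two MPX states in this example) are now clear. */
	printf("xfeatures_mask = %#llx\n", (unsigned long long)xfeatures_mask);
	return 0;
}
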
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f1d528bb66a6..c29020907886 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -212,9 +212,6 @@ ENTRY(startup_32_smp)
 #endif
 
 .Ldefault_entry:
-#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
-			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
-			 X86_CR0_PG)
 	movl $(CR0_STATE & ~X86_CR0_PG),%eax
 	movl %eax,%cr0
 
@@ -402,7 +399,7 @@ ENTRY(early_idt_handler_array)
 	# 24(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
+	.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
 	pushl $0		# Dummy error code, to make stack frame uniform
 	.endif
 	pushl $i		# 20(%esp) Vector number
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6dde3f3fc1f8..7dca675fe78d 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -38,11 +38,12 @@
  *
  */
 
-#define p4d_index(x)	(((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1))
 #define pud_index(x)	(((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1))
 
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
 PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE)
 PGD_START_KERNEL = pgd_index(__START_KERNEL_map)
+#endif
 L3_START_KERNEL = pud_index(__START_KERNEL_map)
 
 	.text
@@ -50,6 +51,7 @@ L3_START_KERNEL = pud_index(__START_KERNEL_map)
 	.code64
 	.globl startup_64
 startup_64:
+	UNWIND_HINT_EMPTY
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * and someone has loaded an identity mapped page table
@@ -89,6 +91,7 @@ startup_64:
 	addq	$(early_top_pgt - __START_KERNEL_map), %rax
 	jmp 1f
 ENTRY(secondary_startup_64)
+	UNWIND_HINT_EMPTY
 	/*
 	 * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0,
 	 * and someone has loaded a mapped page table.
@@ -133,6 +136,7 @@ ENTRY(secondary_startup_64)
 	movq	$1f, %rax
 	jmp	*%rax
 1:
+	UNWIND_HINT_EMPTY
 
 	/* Check if nx is implemented */
 	movl	$0x80000001, %eax
@@ -150,9 +154,6 @@ ENTRY(secondary_startup_64)
 1:	wrmsr				/* Make changes effective */
 
 	/* Setup cr0 */
-#define CR0_STATE	(X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \
-			 X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \
-			 X86_CR0_PG)
 	movl	$CR0_STATE, %eax
 	/* Make changes effective */
 	movq	%rax, %cr0
@@ -235,7 +236,7 @@ ENTRY(secondary_startup_64)
 	pushq	%rax		# target address in negative space
 	lretq
 .Lafter_lret:
-ENDPROC(secondary_startup_64)
+END(secondary_startup_64)
 
 #include "verify_cpu.S"
 
@@ -247,6 +248,7 @@ ENDPROC(secondary_startup_64)
  */
 ENTRY(start_cpu0)
 	movq	initial_stack(%rip), %rsp
+	UNWIND_HINT_EMPTY
 	jmp	.Ljump_to_C_code
 ENDPROC(start_cpu0)
 #endif
@@ -266,26 +268,24 @@ ENDPROC(start_cpu0)
 	.quad  init_thread_union + THREAD_SIZE - SIZEOF_PTREGS
 	__FINITDATA
 
-bad_address:
-	jmp bad_address
-
 	__INIT
 ENTRY(early_idt_handler_array)
-	# 104(%rsp) %rflags
-	#  96(%rsp) %cs
-	#  88(%rsp) %rip
-	#  80(%rsp) error code
 	i = 0
 	.rept NUM_EXCEPTION_VECTORS
-	.ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1
-	pushq $0		# Dummy error code, to make stack frame uniform
+	.if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0
+		UNWIND_HINT_IRET_REGS
+		pushq $0	# Dummy error code, to make stack frame uniform
+	.else
+		UNWIND_HINT_IRET_REGS offset=8
 	.endif
 	pushq $i		# 72(%rsp) Vector number
 	jmp early_idt_handler_common
+	UNWIND_HINT_IRET_REGS
 	i = i + 1
 	.fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc
 	.endr
-ENDPROC(early_idt_handler_array)
+	UNWIND_HINT_IRET_REGS offset=16
+END(early_idt_handler_array)
 
 early_idt_handler_common:
 	/*
@@ -313,6 +313,7 @@ early_idt_handler_common:
 	pushq %r13				/* pt_regs->r13 */
 	pushq %r14				/* pt_regs->r14 */
 	pushq %r15				/* pt_regs->r15 */
+	UNWIND_HINT_REGS
 
 	cmpq $14,%rsi		/* Page fault? */
 	jnz 10f
@@ -327,8 +328,8 @@ early_idt_handler_common:
 
 20:
 	decl early_recursion_flag(%rip)
-	jmp restore_regs_and_iret
-ENDPROC(early_idt_handler_common)
+	jmp restore_regs_and_return_to_kernel
+END(early_idt_handler_common)
 
 	__INITDATA
 
@@ -362,10 +363,7 @@ NEXT_PAGE(early_dynamic_pgts)
 
 	.data
 
-#ifndef CONFIG_XEN
-NEXT_PAGE(init_top_pgt)
-	.fill	512,8,0
-#else
+#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH)
 NEXT_PAGE(init_top_pgt)
 	.quad   level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE_NOENC
 	.org    init_top_pgt + PGD_PAGE_OFFSET*8, 0
@@ -382,6 +380,9 @@ NEXT_PAGE(level2_ident_pgt)
 	 * Don't set NX because code runs from these pages.
 	 */
 	PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD)
+#else
+NEXT_PAGE(init_top_pgt)
+	.fill	512,8,0
 #endif
 
 #ifdef CONFIG_X86_5LEVEL
@@ -435,7 +436,7 @@ ENTRY(phys_base)
 EXPORT_SYMBOL(phys_base)
 
 #include "../../x86/xen/xen-head.S"
-	
+
 	__PAGE_ALIGNED_BSS
 NEXT_PAGE(empty_zero_page)
 	.skip PAGE_SIZE
diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c
index 8bb9594d0761..3df743b60c80 100644
--- a/arch/x86/kernel/kvm.c
+++ b/arch/x86/kernel/kvm.c
@@ -75,8 +75,8 @@ static int parse_no_kvmclock_vsyscall(char *arg)
 
 early_param("no-kvmclock-vsyscall", parse_no_kvmclock_vsyscall);
 
-static DEFINE_PER_CPU(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
-static DEFINE_PER_CPU(struct kvm_steal_time, steal_time) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_vcpu_pv_apf_data, apf_reason) __aligned(64);
+static DEFINE_PER_CPU_DECRYPTED(struct kvm_steal_time, steal_time) __aligned(64);
 static int has_steal_clock = 0;
 
 /*
@@ -312,7 +312,7 @@ static void kvm_register_steal_time(void)
 		cpu, (unsigned long long) slow_virt_to_phys(st));
 }
 
-static DEFINE_PER_CPU(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
+static DEFINE_PER_CPU_DECRYPTED(unsigned long, kvm_apic_eoi) = KVM_PV_EOI_DISABLED;
 
 static notrace void kvm_guest_apic_eoi_write(u32 reg, u32 val)
 {
@@ -426,9 +426,42 @@ void kvm_disable_steal_time(void)
 	wrmsr(MSR_KVM_STEAL_TIME, 0, 0);
 }
 
+static inline void __set_percpu_decrypted(void *ptr, unsigned long size)
+{
+	early_set_memory_decrypted((unsigned long) ptr, size);
+}
+
+/*
+ * Iterate through all possible CPUs and map the memory regions pointed
+ * to by apf_reason, steal_time and kvm_apic_eoi as decrypted at once.
+ *
+ * Note: we iterate through all possible CPUs to ensure that CPUs
+ * hotplugged later will have their per-cpu variables already mapped as
+ * decrypted.
+ */
+static void __init sev_map_percpu_data(void)
+{
+	int cpu;
+
+	if (!sev_active())
+		return;
+
+	for_each_possible_cpu(cpu) {
+		__set_percpu_decrypted(&per_cpu(apf_reason, cpu), sizeof(apf_reason));
+		__set_percpu_decrypted(&per_cpu(steal_time, cpu), sizeof(steal_time));
+		__set_percpu_decrypted(&per_cpu(kvm_apic_eoi, cpu), sizeof(kvm_apic_eoi));
+	}
+}
+
 #ifdef CONFIG_SMP
 static void __init kvm_smp_prepare_boot_cpu(void)
 {
+	/*
+	 * Map the per-cpu variables as decrypted before kvm_guest_cpu_init()
+	 * shares the guest physical address with the hypervisor.
+	 */
+	sev_map_percpu_data();
+
 	kvm_guest_cpu_init();
 	native_smp_prepare_boot_cpu();
 	kvm_spinlock_init();
@@ -496,6 +529,7 @@ void __init kvm_guest_init(void)
 				      kvm_cpu_online, kvm_cpu_down_prepare) < 0)
 		pr_err("kvm_guest: Failed to install cpu hotplug callbacks\n");
 #else
+	sev_map_percpu_data();
 	kvm_guest_cpu_init();
 #endif
 
diff --git a/arch/x86/kernel/kvmclock.c b/arch/x86/kernel/kvmclock.c
index 5b609e28ce3f..77b492c2d658 100644
--- a/arch/x86/kernel/kvmclock.c
+++ b/arch/x86/kernel/kvmclock.c
@@ -27,6 +27,7 @@
 #include <linux/sched.h>
 #include <linux/sched/clock.h>
 
+#include <asm/mem_encrypt.h>
 #include <asm/x86_init.h>
 #include <asm/reboot.h>
 #include <asm/kvmclock.h>
@@ -45,7 +46,7 @@ early_param("no-kvmclock", parse_no_kvmclock);
 
 /* The hypervisor will put information about time periodically here */
 static struct pvclock_vsyscall_time_info *hv_clock;
-static struct pvclock_wall_clock wall_clock;
+static struct pvclock_wall_clock *wall_clock;
 
 struct pvclock_vsyscall_time_info *pvclock_pvti_cpu0_va(void)
 {
@@ -64,15 +65,15 @@ static void kvm_get_wallclock(struct timespec *now)
 	int low, high;
 	int cpu;
 
-	low = (int)__pa_symbol(&wall_clock);
-	high = ((u64)__pa_symbol(&wall_clock) >> 32);
+	low = (int)slow_virt_to_phys(wall_clock);
+	high = ((u64)slow_virt_to_phys(wall_clock) >> 32);
 
 	native_write_msr(msr_kvm_wall_clock, low, high);
 
 	cpu = get_cpu();
 
 	vcpu_time = &hv_clock[cpu].pvti;
-	pvclock_read_wallclock(&wall_clock, vcpu_time, now);
+	pvclock_read_wallclock(wall_clock, vcpu_time, now);
 
 	put_cpu();
 }
@@ -249,11 +250,39 @@ static void kvm_shutdown(void)
 	native_machine_shutdown();
 }
 
+static phys_addr_t __init kvm_memblock_alloc(phys_addr_t size,
+					     phys_addr_t align)
+{
+	phys_addr_t mem;
+
+	mem = memblock_alloc(size, align);
+	if (!mem)
+		return 0;
+
+	if (sev_active()) {
+		if (early_set_memory_decrypted((unsigned long)__va(mem), size))
+			goto e_free;
+	}
+
+	return mem;
+e_free:
+	memblock_free(mem, size);
+	return 0;
+}
+
+static void __init kvm_memblock_free(phys_addr_t addr, phys_addr_t size)
+{
+	if (sev_active())
+		early_set_memory_encrypted((unsigned long)__va(addr), size);
+
+	memblock_free(addr, size);
+}
+
 void __init kvmclock_init(void)
 {
 	struct pvclock_vcpu_time_info *vcpu_time;
-	unsigned long mem;
-	int size, cpu;
+	unsigned long mem, mem_wall_clock;
+	int size, cpu, wall_clock_size;
 	u8 flags;
 
 	size = PAGE_ALIGN(sizeof(struct pvclock_vsyscall_time_info)*NR_CPUS);
@@ -267,21 +296,35 @@ void __init kvmclock_init(void)
 	} else if (!(kvmclock && kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE)))
 		return;
 
-	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
-		msr_kvm_system_time, msr_kvm_wall_clock);
+	wall_clock_size = PAGE_ALIGN(sizeof(struct pvclock_wall_clock));
+	mem_wall_clock = kvm_memblock_alloc(wall_clock_size, PAGE_SIZE);
+	if (!mem_wall_clock)
+		return;
 
-	mem = memblock_alloc(size, PAGE_SIZE);
-	if (!mem)
+	wall_clock = __va(mem_wall_clock);
+	memset(wall_clock, 0, wall_clock_size);
+
+	mem = kvm_memblock_alloc(size, PAGE_SIZE);
+	if (!mem) {
+		kvm_memblock_free(mem_wall_clock, wall_clock_size);
+		wall_clock = NULL;
 		return;
+	}
+
 	hv_clock = __va(mem);
 	memset(hv_clock, 0, size);
 
 	if (kvm_register_clock("primary cpu clock")) {
 		hv_clock = NULL;
-		memblock_free(mem, size);
+		kvm_memblock_free(mem, size);
+		kvm_memblock_free(mem_wall_clock, wall_clock_size);
+		wall_clock = NULL;
 		return;
 	}
 
+	printk(KERN_INFO "kvm-clock: Using msrs %x and %x",
+		msr_kvm_system_time, msr_kvm_wall_clock);
+
 	if (kvm_para_has_feature(KVM_FEATURE_CLOCKSOURCE_STABLE_BIT))
 		pvclock_set_flags(PVCLOCK_TSC_STABLE_BIT);
 
diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c
index a2fcf037bd80..1c1eae961340 100644
--- a/arch/x86/kernel/ldt.c
+++ b/arch/x86/kernel/ldt.c
@@ -13,6 +13,7 @@
 #include <linux/string.h>
 #include <linux/mm.h>
 #include <linux/smp.h>
+#include <linux/syscalls.h>
 #include <linux/slab.h>
 #include <linux/vmalloc.h>
 #include <linux/uaccess.h>
@@ -295,8 +296,8 @@ out:
 	return error;
 }
 
-asmlinkage int sys_modify_ldt(int func, void __user *ptr,
-			      unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+		unsigned long , bytecount)
 {
 	int ret = -ENOSYS;
 
@@ -314,5 +315,14 @@ asmlinkage int sys_modify_ldt(int func, void __user *ptr,
 		ret = write_ldt(ptr, bytecount, 0);
 		break;
 	}
-	return ret;
+	/*
+	 * The SYSCALL_DEFINE() macros give us an 'unsigned long'
+	 * return type, but the ABI for sys_modify_ldt() expects
+	 * 'int'.  This cast gives us an int-sized value in %rax
+	 * for the return code.  The 'unsigned' is necessary so
+	 * the compiler does not try to sign-extend the negative
+	 * return codes into the high half of the register when
+	 * taking the value from int->long.
+	 */
+	return (unsigned int)ret;
 }
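
A stand-alone demonstration of why the cast above matters when an int error code travels through a register-sized (long) return value; this only illustrates the C conversion rules and is not kernel code.

#include <stdio.h>

int main(void)
{
	int ret = -38;				/* e.g. -ENOSYS */

	long sign_extended = ret;		/* 0xffffffffffffffda on 64-bit */
	long int_sized = (unsigned int)ret;	/* 0x00000000ffffffda */

	printf("%lx\n%lx\n", (unsigned long)sign_extended,
	       (unsigned long)int_sized);
	return 0;
}
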
diff --git a/arch/x86/kernel/pmem.c b/arch/x86/kernel/pmem.c
index 3fe690067802..6b07faaa1579 100644
--- a/arch/x86/kernel/pmem.c
+++ b/arch/x86/kernel/pmem.c
@@ -7,7 +7,7 @@
 #include <linux/init.h>
 #include <linux/ioport.h>
 
-static int found(u64 start, u64 end, void *data)
+static int found(struct resource *res, void *data)
 {
 	return 1;
 }
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index c67685337c5a..97fb3e5737f5 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -49,7 +49,13 @@
  */
 __visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
 	.x86_tss = {
-		.sp0 = TOP_OF_INIT_STACK,
+		/*
+		 * .sp0 is only used when entering ring 0 from a lower
+		 * privilege level.  Since the init task never runs anything
+		 * but ring 0 code, there is no need for a valid value here.
+		 * Poison it.
+		 */
+		.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 #ifdef CONFIG_X86_32
 		.ss0 = __KERNEL_DS,
 		.ss1 = __KERNEL_CS,
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 11966251cd42..45bf0c5f93e1 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -284,9 +284,11 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
 	/*
 	 * Reload esp0 and cpu_current_top_of_stack.  This changes
-	 * current_thread_info().
+	 * current_thread_info().  Refresh the SYSENTER configuration in
+	 * case prev or next is vm86.
 	 */
-	load_sp0(tss, next);
+	update_sp0(next_p);
+	refresh_sysenter_cs(next);
 	this_cpu_write(cpu_current_top_of_stack,
 		       (unsigned long)task_stack_page(next_p) +
 		       THREAD_SIZE);
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 302e7b2572d1..eeeb34f85c25 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -274,7 +274,6 @@ int copy_thread_tls(unsigned long clone_flags, unsigned long sp,
 	struct inactive_task_frame *frame;
 	struct task_struct *me = current;
 
-	p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
 	childregs = task_pt_regs(p);
 	fork_frame = container_of(childregs, struct fork_frame, regs);
 	frame = &fork_frame->frame;
@@ -464,8 +463,8 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 	 */
 	this_cpu_write(current_task, next_p);
 
-	/* Reload esp0 and ss1.  This changes current_thread_info(). */
-	load_sp0(tss, next);
+	/* Reload sp0. */
+	update_sp0(next_p);
 
 	/*
 	 * Now maybe reload the debug registers and handle I/O bitmaps
diff --git a/arch/x86/kernel/setup.c b/arch/x86/kernel/setup.c
index 0957dd73d127..507100a72eb3 100644
--- a/arch/x86/kernel/setup.c
+++ b/arch/x86/kernel/setup.c
@@ -380,9 +380,11 @@ static void __init reserve_initrd(void)
 	 * If SME is active, this memory will be marked encrypted by the
 	 * kernel when it is accessed (including relocation). However, the
 	 * ramdisk image was loaded decrypted by the bootloader, so make
-	 * sure that it is encrypted before accessing it.
+	 * sure that it is encrypted before accessing it. For SEV the
+	 * ramdisk will already be encrypted, so only do this for SME.
 	 */
-	sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
+	if (sme_active())
+		sme_early_encrypt(ramdisk_image, ramdisk_end - ramdisk_image);
 
 	initrd_start = 0;
 
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index 7ab6db86b573..88fc3a51640c 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -963,8 +963,7 @@ void common_cpu_up(unsigned int cpu, struct task_struct *idle)
 #ifdef CONFIG_X86_32
 	/* Stack for startup_32 can be just as for start_secondary onwards */
 	irq_ctx_init(cpu);
-	per_cpu(cpu_current_top_of_stack, cpu) =
-		(unsigned long)task_stack_page(idle) + THREAD_SIZE;
+	per_cpu(cpu_current_top_of_stack, cpu) = task_top_of_stack(idle);
 #else
 	initial_gs = per_cpu_offset(cpu);
 #endif
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 5a6b8f809792..c3aca0b533a1 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -60,6 +60,7 @@
 #include <asm/trace/mpx.h>
 #include <asm/mpx.h>
 #include <asm/vm86.h>
+#include <asm/umip.h>
 
 #ifdef CONFIG_X86_64
 #include <asm/x86_init.h>
@@ -141,8 +142,7 @@ void ist_begin_non_atomic(struct pt_regs *regs)
 	 * will catch asm bugs and any attempt to use ist_preempt_enable
 	 * from double_fault.
 	 */
-	BUG_ON((unsigned long)(current_top_of_stack() -
-			       current_stack_pointer) >= THREAD_SIZE);
+	BUG_ON(!on_thread_stack());
 
 	preempt_enable_no_resched();
 }
@@ -518,6 +518,11 @@ do_general_protection(struct pt_regs *regs, long error_code)
 	RCU_LOCKDEP_WARN(!rcu_is_watching(), "entry code didn't wake RCU");
 	cond_local_irq_enable(regs);
 
+	if (static_cpu_has(X86_FEATURE_UMIP)) {
+		if (user_mode(regs) && fixup_umip_exception(regs))
+			return;
+	}
+
 	if (v8086_mode(regs)) {
 		local_irq_enable();
 		handle_vm86_fault((struct kernel_vm86_regs *) regs, error_code);
diff --git a/arch/x86/kernel/umip.c b/arch/x86/kernel/umip.c
new file mode 100644
index 000000000000..6ba82be68cff
--- /dev/null
+++ b/arch/x86/kernel/umip.c
@@ -0,0 +1,366 @@
+/*
+ * umip.c Emulation for instruction protected by the Intel User-Mode
+ * Instruction Prevention feature
+ *
+ * Copyright (c) 2017, Intel Corporation.
+ * Ricardo Neri <ricardo.neri-calderon@linux.intel.com>
+ */
+
+#include <linux/uaccess.h>
+#include <asm/umip.h>
+#include <asm/traps.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <linux/ratelimit.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "umip: " fmt
+
+/** DOC: Emulation for User-Mode Instruction Prevention (UMIP)
+ *
+ * The User-Mode Instruction Prevention feature present in recent Intel
+ * processors prevents a group of instructions (sgdt, sidt, sldt, smsw and str)
+ * from being executed with CPL > 0; attempts to execute them from user space
+ * cause a general protection fault.
+ *
+ * Rather than relaying the general protection fault caused by the
+ * UMIP-protected instructions to user space (in the form of a SIGSEGV signal),
+ * the fault can be trapped and the result of such instructions emulated to
+ * provide dummy values. This both preserves the current kernel behavior and
+ * avoids revealing the system resources that UMIP intends to protect (i.e.,
+ * the locations of the global descriptor and interrupt descriptor tables, the
+ * segment selectors of the local descriptor table, the value of the task
+ * state register and the contents of the CR0 register).
+ *
+ * This emulation is needed because certain applications (e.g., WineHQ and
+ * DOSEMU2) rely on this subset of instructions to function.
+ *
+ * The instructions protected by UMIP can be split in two groups. Those which
+ * return a kernel memory address (sgdt and sidt) and those which return a
+ * value (sldt, str and smsw).
+ *
+ * For the instructions that return a kernel memory address, applications
+ * such as WineHQ rely on the result being located in the kernel memory space,
+ * not on the actual location of the table. The result is emulated as a
+ * hard-coded value that lies close to the top of the kernel memory. The
+ * limits of the GDT and the IDT are set to zero.
+ *
+ * Given that sldt and str are not commonly used in programs that run on WineHQ
+ * or DOSEMU2, they are not emulated.
+ *
+ * The instruction smsw is emulated to return the value that the register CR0
+ * has at boot time, as set in head_32.S.
+ *
+ * Also, emulation is provided only for 32-bit processes; 64-bit processes
+ * that attempt to use the instructions that UMIP protects will receive the
+ * SIGSEGV signal issued as a consequence of the general protection fault.
+ *
+ * Care is taken to appropriately emulate the results when segmentation is
+ * used. That is, rather than relying on USER_DS and USER_CS, the function
+ * insn_get_addr_ref() inspects the segment descriptor pointed by the
+ * registers in pt_regs. This ensures that we correctly obtain the segment
+ * base address and the address and operand sizes even if the user space
+ * application uses a local descriptor table.
+ */
+
+#define UMIP_DUMMY_GDT_BASE 0xfffe0000
+#define UMIP_DUMMY_IDT_BASE 0xffff0000
+
+/*
+ * The SGDT and SIDT instructions store the contents of the global descriptor
+ * table and interrupt table registers, respectively. The destination is a
+ * memory operand of X+2 bytes. X bytes are used to store the base address of
+ * the table and 2 bytes are used to store the limit. In 32-bit processes, the
+ * only processes for which emulation is provided, X has a value of 4.
+ */
+#define UMIP_GDT_IDT_BASE_SIZE 4
+#define UMIP_GDT_IDT_LIMIT_SIZE 2
+
+#define	UMIP_INST_SGDT	0	/* 0F 01 /0 */
+#define	UMIP_INST_SIDT	1	/* 0F 01 /1 */
+#define	UMIP_INST_SMSW	3	/* 0F 01 /4 */
+
+/**
+ * identify_insn() - Identify a UMIP-protected instruction
+ * @insn:	Instruction structure with opcode and ModRM byte.
+ *
+ * From the opcode and ModRM.reg in @insn identify, if any, a UMIP-protected
+ * instruction that can be emulated.
+ *
+ * Returns:
+ *
+ * On success, a constant identifying a specific UMIP-protected instruction that
+ * can be emulated.
+ *
+ * -EINVAL on error or when the instruction is not a UMIP-protected instruction
+ * that can be emulated.
+ */
+static int identify_insn(struct insn *insn)
+{
+	/* By getting modrm we also get the opcode. */
+	insn_get_modrm(insn);
+
+	if (!insn->modrm.nbytes)
+		return -EINVAL;
+
+	/* All the instructions of interest start with 0x0f. */
+	if (insn->opcode.bytes[0] != 0xf)
+		return -EINVAL;
+
+	if (insn->opcode.bytes[1] == 0x1) {
+		switch (X86_MODRM_REG(insn->modrm.value)) {
+		case 0:
+			return UMIP_INST_SGDT;
+		case 1:
+			return UMIP_INST_SIDT;
+		case 4:
+			return UMIP_INST_SMSW;
+		default:
+			return -EINVAL;
+		}
+	}
+
+	/* SLDT and STR are not emulated. */
+	return -EINVAL;
+}
+
+/**
+ * emulate_umip_insn() - Emulate UMIP instructions and return dummy values
+ * @insn:	Instruction structure with operands
+ * @umip_inst:	A constant indicating the instruction to emulate
+ * @data:	Buffer into which the dummy result is stored
+ * @data_size:	Size of the emulated result
+ *
+ * Emulate an instruction protected by UMIP and provide a dummy result. The
+ * result of the emulation is saved in @data. The size of the results depends
+ * on both the instruction and type of operand (register vs memory address).
+ * The size of the result is updated in @data_size. The caller is responsible
+ * for providing a @data buffer of at least UMIP_GDT_IDT_BASE_SIZE +
+ * UMIP_GDT_IDT_LIMIT_SIZE bytes.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error while emulating.
+ */
+static int emulate_umip_insn(struct insn *insn, int umip_inst,
+			     unsigned char *data, int *data_size)
+{
+	unsigned long dummy_base_addr, dummy_value;
+	unsigned short dummy_limit = 0;
+
+	if (!data || !data_size || !insn)
+		return -EINVAL;
+	/*
+	 * These two instructions return the base address and limit of the
+	 * global and interrupt descriptor tables, respectively. According to the
+	 * Intel Software Development manual, the base address can be 24-bit,
+	 * 32-bit or 64-bit. Limit is always 16-bit. If the operand size is
+	 * 16-bit, the returned value of the base address is supposed to be a
+	 * zero-extended 24-bit number. However, it seems that a 32-bit number
+	 * is always returned irrespective of the operand size.
+	 */
+	if (umip_inst == UMIP_INST_SGDT || umip_inst == UMIP_INST_SIDT) {
+		/* SGDT and SIDT do not use register operands. */
+		if (X86_MODRM_MOD(insn->modrm.value) == 3)
+			return -EINVAL;
+
+		if (umip_inst == UMIP_INST_SGDT)
+			dummy_base_addr = UMIP_DUMMY_GDT_BASE;
+		else
+			dummy_base_addr = UMIP_DUMMY_IDT_BASE;
+
+		*data_size = UMIP_GDT_IDT_LIMIT_SIZE + UMIP_GDT_IDT_BASE_SIZE;
+
+		memcpy(data + 2, &dummy_base_addr, UMIP_GDT_IDT_BASE_SIZE);
+		memcpy(data, &dummy_limit, UMIP_GDT_IDT_LIMIT_SIZE);
+
+	} else if (umip_inst == UMIP_INST_SMSW) {
+		dummy_value = CR0_STATE;
+
+		/*
+		 * Even though the CR0 register has 4 bytes, the number
+		 * of bytes to be copied in the result buffer is determined
+		 * by whether the operand is a register or a memory location.
+		 * If operand is a register, return as many bytes as the operand
+		 * size. If operand is memory, return only the two least
+		 * significant bytes of CR0.
+		 */
+		if (X86_MODRM_MOD(insn->modrm.value) == 3)
+			*data_size = insn->opnd_bytes;
+		else
+			*data_size = 2;
+
+		memcpy(data, &dummy_value, *data_size);
+	/* STR and SLDT are not emulated. */
+	} else {
+		return -EINVAL;
+	}
+
+	return 0;
+}
+
+/**
+ * force_sig_info_umip_fault() - Force a SIGSEGV with SEGV_MAPERR
+ * @addr:	Address that caused the signal
+ * @regs:	Register set containing the instruction pointer
+ *
+ * Force a SIGSEGV signal with SEGV_MAPERR as the error code. This function is
+ * intended to be used to provide a segmentation fault when the result of the
+ * UMIP emulation could not be copied to the user space memory.
+ *
+ * Returns: none
+ */
+static void force_sig_info_umip_fault(void __user *addr, struct pt_regs *regs)
+{
+	siginfo_t info;
+	struct task_struct *tsk = current;
+
+	tsk->thread.cr2		= (unsigned long)addr;
+	tsk->thread.error_code	= X86_PF_USER | X86_PF_WRITE;
+	tsk->thread.trap_nr	= X86_TRAP_PF;
+
+	info.si_signo	= SIGSEGV;
+	info.si_errno	= 0;
+	info.si_code	= SEGV_MAPERR;
+	info.si_addr	= addr;
+	force_sig_info(SIGSEGV, &info, tsk);
+
+	if (!(show_unhandled_signals && unhandled_signal(tsk, SIGSEGV)))
+		return;
+
+	pr_err_ratelimited("%s[%d] umip emulation segfault ip:%lx sp:%lx error:%x in %lx\n",
+			   tsk->comm, task_pid_nr(tsk), regs->ip,
+			   regs->sp, X86_PF_USER | X86_PF_WRITE,
+			   regs->ip);
+}
+
+/**
+ * fixup_umip_exception() - Fixup a general protection fault caused by UMIP
+ * @regs:	Registers as saved when entering the #GP handler
+ *
+ * The instructions sgdt, sidt, str, smsw, sldt cause a general protection
+ * fault if executed with CPL > 0 (i.e., from user space). If the offending
+ * user-space process is not in long mode, this function fixes the exception
+ * up and provides dummy results for sgdt, sidt and smsw; str and sldt are not
+ * fixed up. Also long mode user-space processes are not fixed up.
+ *
+ * If operands are memory addresses, results are copied to user-space memory as
+ * indicated by the instruction pointed to by eIP, using the registers in the
+ * instruction operands. If operands are registers, results are copied into
+ * the context that was saved when entering kernel mode.
+ *
+ * Returns:
+ *
+ * True if emulation was successful; false if not.
+ */
+bool fixup_umip_exception(struct pt_regs *regs)
+{
+	int not_copied, nr_copied, reg_offset, dummy_data_size, umip_inst;
+	unsigned long seg_base = 0, *reg_addr;
+	/* 10 bytes is the maximum size of the result of UMIP instructions */
+	unsigned char dummy_data[10] = { 0 };
+	unsigned char buf[MAX_INSN_SIZE];
+	void __user *uaddr;
+	struct insn insn;
+	char seg_defs;
+
+	if (!regs)
+		return false;
+
+	/* Do not emulate 64-bit processes. */
+	if (user_64bit_mode(regs))
+		return false;
+
+	/*
+	 * If not in user-space long mode, a custom code segment could be in
+	 * use. This is true in protected mode (if the process defined a local
+	 * descriptor table), or virtual-8086 mode. In most of the cases
+	 * seg_base will be zero as in USER_CS.
+	 */
+	if (!user_64bit_mode(regs))
+		seg_base = insn_get_seg_base(regs, INAT_SEG_REG_CS);
+
+	if (seg_base == -1L)
+		return false;
+
+	not_copied = copy_from_user(buf, (void __user *)(seg_base + regs->ip),
+				    sizeof(buf));
+	nr_copied = sizeof(buf) - not_copied;
+
+	/*
+	 * The copy_from_user above could have failed if user code is protected
+	 * by a memory protection key. Give up on emulation in such a case.
+	 * Should we issue a page fault?
+	 */
+	if (!nr_copied)
+		return false;
+
+	insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
+
+	/*
+	 * Override the default operand and address sizes with what is specified
+	 * in the code segment descriptor. The instruction decoder only sets
+	 * the address size to either 4 or 8 bytes and does nothing for
+	 * the operand size. This is OK for most cases, but we could
+	 * have special cases where, for instance, a 16-bit code segment
+	 * descriptor is used.
+	 * If there is an address override prefix, the instruction decoder
+	 * correctly updates these values, even for 16-bit defaults.
+	 */
+	seg_defs = insn_get_code_seg_params(regs);
+	if (seg_defs == -EINVAL)
+		return false;
+
+	insn.addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);
+	insn.opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
+
+	insn_get_length(&insn);
+	if (nr_copied < insn.length)
+		return false;
+
+	umip_inst = identify_insn(&insn);
+	if (umip_inst < 0)
+		return false;
+
+	if (emulate_umip_insn(&insn, umip_inst, dummy_data, &dummy_data_size))
+		return false;
+
+	/*
+	 * If operand is a register, write result to the copy of the register
+	 * value that was pushed to the stack when entering into kernel mode.
+	 * Upon exit, the value we write will be restored to the actual hardware
+	 * register.
+	 */
+	if (X86_MODRM_MOD(insn.modrm.value) == 3) {
+		reg_offset = insn_get_modrm_rm_off(&insn, regs);
+
+		/*
+		 * Negative values are usually errors. In memory addressing,
+		 * the exception is -EDOM. Since we expect a register operand,
+		 * all negative values are errors.
+		 */
+		if (reg_offset < 0)
+			return false;
+
+		reg_addr = (unsigned long *)((unsigned long)regs + reg_offset);
+		memcpy(reg_addr, dummy_data, dummy_data_size);
+	} else {
+		uaddr = insn_get_addr_ref(&insn, regs);
+		if ((unsigned long)uaddr == -1L)
+			return false;
+
+		nr_copied = copy_to_user(uaddr, dummy_data, dummy_data_size);
+		if (nr_copied  > 0) {
+			/*
+			 * If copy fails, send a signal and tell caller that
+			 * fault was fixed up.
+			 */
+			force_sig_info_umip_fault(uaddr, regs);
+			return true;
+		}
+	}
+
+	/* increase IP to let the program keep going */
+	regs->ip += insn.length;
+	return true;
+}
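
As a usage illustration (not part of the patch): a minimal 32-bit user-space program that executes one of the protected instructions. On a UMIP-enabled CPU with this emulation in place it should print the dummy base and limit described above instead of receiving SIGSEGV; the 32-bit build flag is an assumption, and 64-bit processes are not emulated.

/* Build as a 32-bit binary, e.g.: gcc -m32 -O2 -o sgdt-test sgdt-test.c */
#include <stdint.h>
#include <stdio.h>

struct table_desc {
	uint16_t limit;
	uint32_t base;
} __attribute__((packed));

int main(void)
{
	struct table_desc gdt = { 0, 0 };

	/* SGDT is UMIP-protected at CPL 3; the kernel emulates the result. */
	asm volatile("sgdt %0" : "=m" (gdt));

	printf("GDT base 0x%08x, limit 0x%04x\n", gdt.base, gdt.limit);
	return 0;
}
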
diff --git a/arch/x86/kernel/uprobes.c b/arch/x86/kernel/uprobes.c
index 495c776de4b4..a3755d293a48 100644
--- a/arch/x86/kernel/uprobes.c
+++ b/arch/x86/kernel/uprobes.c
@@ -271,12 +271,15 @@ static bool is_prefix_bad(struct insn *insn)
 	int i;
 
 	for (i = 0; i < insn->prefixes.nbytes; i++) {
-		switch (insn->prefixes.bytes[i]) {
-		case 0x26:	/* INAT_PFX_ES   */
-		case 0x2E:	/* INAT_PFX_CS   */
-		case 0x36:	/* INAT_PFX_DS   */
-		case 0x3E:	/* INAT_PFX_SS   */
-		case 0xF0:	/* INAT_PFX_LOCK */
+		insn_attr_t attr;
+
+		attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+		switch (attr) {
+		case INAT_MAKE_PREFIX(INAT_PFX_ES):
+		case INAT_MAKE_PREFIX(INAT_PFX_CS):
+		case INAT_MAKE_PREFIX(INAT_PFX_DS):
+		case INAT_MAKE_PREFIX(INAT_PFX_SS):
+		case INAT_MAKE_PREFIX(INAT_PFX_LOCK):
 			return true;
 		}
 	}
diff --git a/arch/x86/kernel/verify_cpu.S b/arch/x86/kernel/verify_cpu.S
index 014ea59aa153..3d3c2f71f617 100644
--- a/arch/x86/kernel/verify_cpu.S
+++ b/arch/x86/kernel/verify_cpu.S
@@ -33,7 +33,7 @@
 #include <asm/cpufeatures.h>
 #include <asm/msr-index.h>
 
-verify_cpu:
+ENTRY(verify_cpu)
 	pushf				# Save caller passed flags
 	push	$0			# Kill any dangerous flags
 	popf
@@ -139,3 +139,4 @@ verify_cpu:
 	popf				# Restore caller passed flags
 	xorl %eax, %eax
 	ret
+ENDPROC(verify_cpu)
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index 68244742ecb0..5edb27f1a2c4 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -55,6 +55,7 @@
 #include <asm/irq.h>
 #include <asm/traps.h>
 #include <asm/vm86.h>
+#include <asm/switch_to.h>
 
 /*
  * Known problems:
@@ -94,7 +95,6 @@
 
 void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 {
-	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct vm86plus_struct __user *user;
 	struct vm86 *vm86 = current->thread.vm86;
@@ -146,12 +146,13 @@ void save_v86_state(struct kernel_vm86_regs *regs, int retval)
 		do_exit(SIGSEGV);
 	}
 
-	tss = &per_cpu(cpu_tss, get_cpu());
+	preempt_disable();
 	tsk->thread.sp0 = vm86->saved_sp0;
 	tsk->thread.sysenter_cs = __KERNEL_CS;
-	load_sp0(tss, &tsk->thread);
+	update_sp0(tsk);
+	refresh_sysenter_cs(&tsk->thread);
 	vm86->saved_sp0 = 0;
-	put_cpu();
+	preempt_enable();
 
 	memcpy(&regs->pt, &vm86->regs32, sizeof(struct pt_regs));
 
@@ -237,7 +238,6 @@ SYSCALL_DEFINE2(vm86, unsigned long, cmd, unsigned long, arg)
 
 static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 {
-	struct tss_struct *tss;
 	struct task_struct *tsk = current;
 	struct vm86 *vm86 = tsk->thread.vm86;
 	struct kernel_vm86_regs vm86regs;
@@ -365,15 +365,17 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus)
 	vm86->saved_sp0 = tsk->thread.sp0;
 	lazy_save_gs(vm86->regs32.gs);
 
-	tss = &per_cpu(cpu_tss, get_cpu());
 	/* make room for real-mode segments */
+	preempt_disable();
 	tsk->thread.sp0 += 16;
 
-	if (static_cpu_has(X86_FEATURE_SEP))
+	if (static_cpu_has(X86_FEATURE_SEP)) {
 		tsk->thread.sysenter_cs = 0;
+		refresh_sysenter_cs(&tsk->thread);
+	}
 
-	load_sp0(tss, &tsk->thread);
-	put_cpu();
+	update_sp0(tsk);
+	preempt_enable();
 
 	if (vm86->flags & VM86_SCREEN_BITMAP)
 		mark_screen_rdonly(tsk->mm);
diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile
index 457f681ef379..7b181b61170e 100644
--- a/arch/x86/lib/Makefile
+++ b/arch/x86/lib/Makefile
@@ -24,7 +24,7 @@ lib-y := delay.o misc.o cmdline.o cpu.o
 lib-y += usercopy_$(BITS).o usercopy.o getuser.o putuser.o
 lib-y += memcpy_$(BITS).o
 lib-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem.o
-lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o
+lib-$(CONFIG_INSTRUCTION_DECODER) += insn.o inat.o insn-eval.o
 lib-$(CONFIG_RANDOMIZE_BASE) += kaslr.o
 
 obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o
diff --git a/arch/x86/lib/insn-eval.c b/arch/x86/lib/insn-eval.c
new file mode 100644
index 000000000000..35625d279458
--- /dev/null
+++ b/arch/x86/lib/insn-eval.c
@@ -0,0 +1,1364 @@
+/*
+ * Utility functions for x86 operand and address decoding
+ *
+ * Copyright (C) Intel Corporation 2017
+ */
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/ratelimit.h>
+#include <linux/mmu_context.h>
+#include <asm/desc_defs.h>
+#include <asm/desc.h>
+#include <asm/inat.h>
+#include <asm/insn.h>
+#include <asm/insn-eval.h>
+#include <asm/ldt.h>
+#include <asm/vm86.h>
+
+#undef pr_fmt
+#define pr_fmt(fmt) "insn: " fmt
+
+enum reg_type {
+	REG_TYPE_RM = 0,
+	REG_TYPE_INDEX,
+	REG_TYPE_BASE,
+};
+
+/**
+ * is_string_insn() - Determine if instruction is a string instruction
+ * @insn:	Instruction containing the opcode to inspect
+ *
+ * Returns:
+ *
+ * true if the instruction, determined by the opcode, is any of the
+ * string instructions as defined in the Intel Software Development manual.
+ * False otherwise.
+ */
+static bool is_string_insn(struct insn *insn)
+{
+	insn_get_opcode(insn);
+
+	/* All string instructions have a 1-byte opcode. */
+	if (insn->opcode.nbytes != 1)
+		return false;
+
+	switch (insn->opcode.bytes[0]) {
+	case 0x6c ... 0x6f:	/* INS, OUTS */
+	case 0xa4 ... 0xa7:	/* MOVS, CMPS */
+	case 0xaa ... 0xaf:	/* STOS, LODS, SCAS */
+		return true;
+	default:
+		return false;
+	}
+}
+
+/**
+ * get_seg_reg_override_idx() - obtain segment register override index
+ * @insn:	Valid instruction with segment override prefixes
+ *
+ * Inspect the instruction prefixes in @insn and find segment overrides, if any.
+ *
+ * Returns:
+ *
+ * A constant identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_DEFAULT is returned if no segment override
+ * prefixes were found.
+ *
+ * -EINVAL in case of error.
+ */
+static int get_seg_reg_override_idx(struct insn *insn)
+{
+	int idx = INAT_SEG_REG_DEFAULT;
+	int num_overrides = 0, i;
+
+	insn_get_prefixes(insn);
+
+	/* Look for any segment override prefixes. */
+	for (i = 0; i < insn->prefixes.nbytes; i++) {
+		insn_attr_t attr;
+
+		attr = inat_get_opcode_attribute(insn->prefixes.bytes[i]);
+		switch (attr) {
+		case INAT_MAKE_PREFIX(INAT_PFX_CS):
+			idx = INAT_SEG_REG_CS;
+			num_overrides++;
+			break;
+		case INAT_MAKE_PREFIX(INAT_PFX_SS):
+			idx = INAT_SEG_REG_SS;
+			num_overrides++;
+			break;
+		case INAT_MAKE_PREFIX(INAT_PFX_DS):
+			idx = INAT_SEG_REG_DS;
+			num_overrides++;
+			break;
+		case INAT_MAKE_PREFIX(INAT_PFX_ES):
+			idx = INAT_SEG_REG_ES;
+			num_overrides++;
+			break;
+		case INAT_MAKE_PREFIX(INAT_PFX_FS):
+			idx = INAT_SEG_REG_FS;
+			num_overrides++;
+			break;
+		case INAT_MAKE_PREFIX(INAT_PFX_GS):
+			idx = INAT_SEG_REG_GS;
+			num_overrides++;
+			break;
+		/* No default action needed. */
+		}
+	}
+
+	/* More than one segment override prefix leads to undefined behavior. */
+	if (num_overrides > 1)
+		return -EINVAL;
+
+	return idx;
+}
+
+/**
+ * check_seg_overrides() - check if segment override prefixes are allowed
+ * @insn:	Valid instruction with segment override prefixes
+ * @regoff:	Operand offset, in pt_regs, for which the check is performed
+ *
+ * For a particular register used in register-indirect addressing, determine if
+ * segment override prefixes can be used. Specifically, no overrides are allowed
+ * for rDI if used with a string instruction.
+ *
+ * Returns:
+ *
+ * True if segment override prefixes can be used with the register indicated
+ * in @regoff. False if otherwise.
+ */
+static bool check_seg_overrides(struct insn *insn, int regoff)
+{
+	if (regoff == offsetof(struct pt_regs, di) && is_string_insn(insn))
+		return false;
+
+	return true;
+}
+
+/**
+ * resolve_default_seg() - resolve default segment register index for an operand
+ * @insn:	Instruction with opcode and address size. Must be valid.
+ * @regs:	Register values as seen when entering kernel mode
+ * @off:	Operand offset, in pt_regs, for which resolution is needed
+ *
+ * Resolve the default segment register index associated with the instruction
+ * operand register indicated by @off. Such index is resolved based on defaults
+ * described in the Intel Software Development Manual.
+ *
+ * Returns:
+ *
+ * If in protected mode, a constant identifying the segment register to use,
+ * among CS, SS, ES or DS. If in long mode, INAT_SEG_REG_IGNORE.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_default_seg(struct insn *insn, struct pt_regs *regs, int off)
+{
+	if (user_64bit_mode(regs))
+		return INAT_SEG_REG_IGNORE;
+	/*
+	 * Resolve the default segment register as described in Section 3.7.4
+	 * of the Intel Software Development Manual Vol. 1:
+	 *
+	 *  + DS for all references involving r[ABCD]X, and rSI.
+	 *  + If used in a string instruction, ES for rDI. Otherwise, DS.
+	 *  + AX, CX and DX are not valid register operands in 16-bit address
+	 *    encodings but are valid for 32-bit and 64-bit encodings.
+	 *  + -EDOM is reserved to identify cases in which no register
+	 *    is used (i.e., displacement-only addressing). Use DS.
+	 *  + SS for rSP or rBP.
+	 *  + CS for rIP.
+	 */
+
+	switch (off) {
+	case offsetof(struct pt_regs, ax):
+	case offsetof(struct pt_regs, cx):
+	case offsetof(struct pt_regs, dx):
+		/* Need insn to verify address size. */
+		if (insn->addr_bytes == 2)
+			return -EINVAL;
+
+	case -EDOM:
+	case offsetof(struct pt_regs, bx):
+	case offsetof(struct pt_regs, si):
+		return INAT_SEG_REG_DS;
+
+	case offsetof(struct pt_regs, di):
+		if (is_string_insn(insn))
+			return INAT_SEG_REG_ES;
+		return INAT_SEG_REG_DS;
+
+	case offsetof(struct pt_regs, bp):
+	case offsetof(struct pt_regs, sp):
+		return INAT_SEG_REG_SS;
+
+	case offsetof(struct pt_regs, ip):
+		return INAT_SEG_REG_CS;
+
+	default:
+		return -EINVAL;
+	}
+}
+
+/**
+ * resolve_seg_reg() - obtain segment register index
+ * @insn:	Instruction with operands
+ * @regs:	Register values as seen when entering kernel mode
+ * @regoff:	Operand offset, in pt_regs, used to determine segment register
+ *
+ * Determine the segment register associated with the operands and, if
+ * applicable, prefixes and the instruction pointed by @insn.
+ *
+ * The segment register associated to an operand used in register-indirect
+ * addressing depends on:
+ *
+ * a) Whether running in long mode (in such a case segments are ignored, except
+ * if FS or GS are used).
+ *
+ * b) Whether segment override prefixes can be used. Certain instructions and
+ *    registers do not allow override prefixes.
+ *
+ * c) Whether segment override prefixes are found in the instruction prefixes.
+ *
+ * d) If there are no segment override prefixes or they cannot be used, the
+ *    default segment register associated with the operand register is used.
+ *
+ * The function checks first if segment override prefixes can be used with the
+ * operand indicated by @regoff. If allowed, the overridden segment register
+ * index is obtained. Lastly, if no prefixes were found or they cannot be used,
+ * the segment register index is resolved based on the defaults described in the
+ * Intel documentation. In long mode, all segment register indexes will be
+ * ignored, except if overrides were found for FS or GS. All these operations
+ * are done using helper functions.
+ *
+ * The operand register, @regoff, is represented as the offset from the base of
+ * pt_regs.
+ *
+ * As stated, the main use of this function is to determine the segment register
+ * index based on the instruction, its operands and prefixes. Hence, @insn
+ * must be valid. However, if @regoff indicates rIP, we don't need to inspect
+ * @insn at all, as CS is always used in that case. This is checked
+ * before proceeding further.
+ *
+ * Please note that this function does not return the value in the segment
+ * register (i.e., the segment selector) but our defined index. The segment
+ * selector needs to be obtained using get_segment_selector() and passing the
+ * segment register index resolved by this function.
+ *
+ * Returns:
+ *
+ * An index identifying the segment register to use, among CS, SS, DS,
+ * ES, FS, or GS. INAT_SEG_REG_IGNORE is returned if running in long mode.
+ *
+ * -EINVAL in case of error.
+ */
+static int resolve_seg_reg(struct insn *insn, struct pt_regs *regs, int regoff)
+{
+	int idx;
+
+	/*
+	 * In the unlikely event of having to resolve the segment register
+	 * index for rIP, do it first. Segment override prefixes should not
+	 * be used. Hence, it is not necessary to inspect the instruction,
+	 * which may be invalid at this point.
+	 */
+	if (regoff == offsetof(struct pt_regs, ip)) {
+		if (user_64bit_mode(regs))
+			return INAT_SEG_REG_IGNORE;
+		else
+			return INAT_SEG_REG_CS;
+	}
+
+	if (!insn)
+		return -EINVAL;
+
+	if (!check_seg_overrides(insn, regoff))
+		return resolve_default_seg(insn, regs, regoff);
+
+	idx = get_seg_reg_override_idx(insn);
+	if (idx < 0)
+		return idx;
+
+	if (idx == INAT_SEG_REG_DEFAULT)
+		return resolve_default_seg(insn, regs, regoff);
+
+	/*
+	 * In long mode, segment override prefixes are ignored, except for
+	 * overrides for FS and GS.
+	 */
+	if (user_64bit_mode(regs)) {
+		if (idx != INAT_SEG_REG_FS &&
+		    idx != INAT_SEG_REG_GS)
+			idx = INAT_SEG_REG_IGNORE;
+	}
+
+	return idx;
+}
+
+/**
+ * get_segment_selector() - obtain segment selector
+ * @regs:		Register values as seen when entering kernel mode
+ * @seg_reg_idx:	Segment register index to use
+ *
+ * Obtain the segment selector from any of the CS, SS, DS, ES, FS, GS segment
+ * registers. In CONFIG_X86_32, the segment is obtained from either pt_regs or
+ * kernel_vm86_regs as applicable. In CONFIG_X86_64, CS and SS are obtained
+ * from pt_regs. DS, ES, FS and GS are obtained by reading the actual CPU
+ * registers. This is done only for completeness, as in CONFIG_X86_64 segment
+ * registers are ignored.
+ *
+ * Returns:
+ *
+ * Value of the segment selector, including null when running in
+ * long mode.
+ *
+ * -EINVAL on error.
+ */
+static short get_segment_selector(struct pt_regs *regs, int seg_reg_idx)
+{
+#ifdef CONFIG_X86_64
+	unsigned short sel;
+
+	switch (seg_reg_idx) {
+	case INAT_SEG_REG_IGNORE:
+		return 0;
+	case INAT_SEG_REG_CS:
+		return (unsigned short)(regs->cs & 0xffff);
+	case INAT_SEG_REG_SS:
+		return (unsigned short)(regs->ss & 0xffff);
+	case INAT_SEG_REG_DS:
+		savesegment(ds, sel);
+		return sel;
+	case INAT_SEG_REG_ES:
+		savesegment(es, sel);
+		return sel;
+	case INAT_SEG_REG_FS:
+		savesegment(fs, sel);
+		return sel;
+	case INAT_SEG_REG_GS:
+		savesegment(gs, sel);
+		return sel;
+	default:
+		return -EINVAL;
+	}
+#else /* CONFIG_X86_32 */
+	struct kernel_vm86_regs *vm86regs = (struct kernel_vm86_regs *)regs;
+
+	if (v8086_mode(regs)) {
+		switch (seg_reg_idx) {
+		case INAT_SEG_REG_CS:
+			return (unsigned short)(regs->cs & 0xffff);
+		case INAT_SEG_REG_SS:
+			return (unsigned short)(regs->ss & 0xffff);
+		case INAT_SEG_REG_DS:
+			return vm86regs->ds;
+		case INAT_SEG_REG_ES:
+			return vm86regs->es;
+		case INAT_SEG_REG_FS:
+			return vm86regs->fs;
+		case INAT_SEG_REG_GS:
+			return vm86regs->gs;
+		case INAT_SEG_REG_IGNORE:
+			/* fall through */
+		default:
+			return -EINVAL;
+		}
+	}
+
+	switch (seg_reg_idx) {
+	case INAT_SEG_REG_CS:
+		return (unsigned short)(regs->cs & 0xffff);
+	case INAT_SEG_REG_SS:
+		return (unsigned short)(regs->ss & 0xffff);
+	case INAT_SEG_REG_DS:
+		return (unsigned short)(regs->ds & 0xffff);
+	case INAT_SEG_REG_ES:
+		return (unsigned short)(regs->es & 0xffff);
+	case INAT_SEG_REG_FS:
+		return (unsigned short)(regs->fs & 0xffff);
+	case INAT_SEG_REG_GS:
+		/*
+		 * GS may or may not be in regs as per CONFIG_X86_32_LAZY_GS.
+		 * The macro below takes care of both cases.
+		 */
+		return get_user_gs(regs);
+	case INAT_SEG_REG_IGNORE:
+		/* fall through */
+	default:
+		return -EINVAL;
+	}
+#endif /* CONFIG_X86_64 */
+}
+
+static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
+			  enum reg_type type)
+{
+	int regno = 0;
+
+	static const int regoff[] = {
+		offsetof(struct pt_regs, ax),
+		offsetof(struct pt_regs, cx),
+		offsetof(struct pt_regs, dx),
+		offsetof(struct pt_regs, bx),
+		offsetof(struct pt_regs, sp),
+		offsetof(struct pt_regs, bp),
+		offsetof(struct pt_regs, si),
+		offsetof(struct pt_regs, di),
+#ifdef CONFIG_X86_64
+		offsetof(struct pt_regs, r8),
+		offsetof(struct pt_regs, r9),
+		offsetof(struct pt_regs, r10),
+		offsetof(struct pt_regs, r11),
+		offsetof(struct pt_regs, r12),
+		offsetof(struct pt_regs, r13),
+		offsetof(struct pt_regs, r14),
+		offsetof(struct pt_regs, r15),
+#endif
+	};
+	int nr_registers = ARRAY_SIZE(regoff);
+	/*
+	 * Don't possibly decode a 32-bit instruction as
+	 * reading a 64-bit-only register.
+	 */
+	if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
+		nr_registers -= 8;
+
+	switch (type) {
+	case REG_TYPE_RM:
+		regno = X86_MODRM_RM(insn->modrm.value);
+
+		/*
+		 * ModRM.mod == 0 and ModRM.rm == 5 means a 32-bit displacement
+		 * follows the ModRM byte.
+		 */
+		if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+			return -EDOM;
+
+		if (X86_REX_B(insn->rex_prefix.value))
+			regno += 8;
+		break;
+
+	case REG_TYPE_INDEX:
+		regno = X86_SIB_INDEX(insn->sib.value);
+		if (X86_REX_X(insn->rex_prefix.value))
+			regno += 8;
+
+		/*
+		 * If ModRM.mod != 3 and SIB.index == 4, the scale*index
+		 * portion of the address computation is null; this is
+		 * true only if REX.X is 0. If REX.X is set, the index
+		 * refers to r12 and is used in the address computation.
+		 */
+		if (X86_MODRM_MOD(insn->modrm.value) != 3 && regno == 4)
+			return -EDOM;
+		break;
+
+	case REG_TYPE_BASE:
+		regno = X86_SIB_BASE(insn->sib.value);
+		/*
+		 * If ModRM.mod is 0 and SIB.base == 5, the base of the
+		 * register-indirect addressing is 0. In this case, a
+		 * 32-bit displacement follows the SIB byte.
+		 */
+		if (!X86_MODRM_MOD(insn->modrm.value) && regno == 5)
+			return -EDOM;
+
+		if (X86_REX_B(insn->rex_prefix.value))
+			regno += 8;
+		break;
+
+	default:
+		pr_err_ratelimited("invalid register type: %d\n", type);
+		return -EINVAL;
+	}
+
+	if (regno >= nr_registers) {
+		WARN_ONCE(1, "decoded an instruction with an invalid register");
+		return -EINVAL;
+	}
+	return regoff[regno];
+}
+
+/**
+ * get_reg_offset_16() - Obtain offset of register indicated by instruction
+ * @insn:	Instruction containing ModRM byte
+ * @regs:	Register values as seen when entering kernel mode
+ * @offs1:	Offset of the first operand register
+ * @offs2:	Offset of the second operand register, if applicable
+ *
+ * Obtain the offset, in pt_regs, of the registers indicated by the ModRM byte
+ * in @insn. This function is to be used with 16-bit address encodings. The
+ * @offs1 and @offs2 will be written with the offset of the two registers
+ * indicated by the instruction. In cases where any of the registers is not
+ * referenced by the instruction, the value will be set to -EDOM.
+ *
+ * Returns:
+ *
+ * 0 on success, -EINVAL on error.
+ */
+static int get_reg_offset_16(struct insn *insn, struct pt_regs *regs,
+			     int *offs1, int *offs2)
+{
+	/*
+	 * 16-bit addressing can use one or two registers. Specifics of
+	 * encodings are given in Table 2-1. "16-Bit Addressing Forms with the
+	 * ModR/M Byte" of the Intel Software Development Manual.
+	 */
+	static const int regoff1[] = {
+		offsetof(struct pt_regs, bx),
+		offsetof(struct pt_regs, bx),
+		offsetof(struct pt_regs, bp),
+		offsetof(struct pt_regs, bp),
+		offsetof(struct pt_regs, si),
+		offsetof(struct pt_regs, di),
+		offsetof(struct pt_regs, bp),
+		offsetof(struct pt_regs, bx),
+	};
+
+	static const int regoff2[] = {
+		offsetof(struct pt_regs, si),
+		offsetof(struct pt_regs, di),
+		offsetof(struct pt_regs, si),
+		offsetof(struct pt_regs, di),
+		-EDOM,
+		-EDOM,
+		-EDOM,
+		-EDOM,
+	};
+
+	if (!offs1 || !offs2)
+		return -EINVAL;
+
+	/* Operand is a register, use the generic function. */
+	if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+		*offs1 = insn_get_modrm_rm_off(insn, regs);
+		*offs2 = -EDOM;
+		return 0;
+	}
+
+	*offs1 = regoff1[X86_MODRM_RM(insn->modrm.value)];
+	*offs2 = regoff2[X86_MODRM_RM(insn->modrm.value)];
+
+	/*
+	 * If ModRM.mod is 0 and ModRM.rm is 110b, then we use displacement-
+	 * only addressing. This means that no registers are involved in
+	 * computing the effective address. Thus, ensure that the first
+	 * register offset is invalid. The second register offset is already
+	 * invalid under the aforementioned conditions.
+	 */
+	if ((X86_MODRM_MOD(insn->modrm.value) == 0) &&
+	    (X86_MODRM_RM(insn->modrm.value) == 6))
+		*offs1 = -EDOM;
+
+	return 0;
+}
+
+/**
+ * get_desc() - Obtain pointer to a segment descriptor
+ * @sel:	Segment selector
+ *
+ * Given a segment selector, obtain a pointer to the segment descriptor.
+ * Both global and local descriptor tables are supported.
+ *
+ * Returns:
+ *
+ * Pointer to segment descriptor on success.
+ *
+ * NULL on error.
+ */
+static struct desc_struct *get_desc(unsigned short sel)
+{
+	struct desc_ptr gdt_desc = {0, 0};
+	unsigned long desc_base;
+
+#ifdef CONFIG_MODIFY_LDT_SYSCALL
+	if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) {
+		struct desc_struct *desc = NULL;
+		struct ldt_struct *ldt;
+
+		/* Bits [15:3] contain the index of the desired entry. */
+		sel >>= 3;
+
+		mutex_lock(&current->active_mm->context.lock);
+		ldt = current->active_mm->context.ldt;
+		if (ldt && sel < ldt->nr_entries)
+			desc = &ldt->entries[sel];
+
+		mutex_unlock(&current->active_mm->context.lock);
+
+		return desc;
+	}
+#endif
+	native_store_gdt(&gdt_desc);
+
+	/*
+	 * Segment descriptors have a size of 8 bytes. Thus, the index is
+	 * multiplied by 8 to obtain the memory offset of the desired descriptor
+	 * from the base of the GDT. As bits [15:3] of the segment selector
+	 * contain the index, it can be regarded as multiplied by 8 already.
+	 * All that remains is to clear bits [2:0].
+	 */
+	desc_base = sel & ~(SEGMENT_RPL_MASK | SEGMENT_TI_MASK);
+
+	if (desc_base > gdt_desc.size)
+		return NULL;
+
+	return (struct desc_struct *)(gdt_desc.address + desc_base);
+}
+
+/**
+ * insn_get_seg_base() - Obtain base address of segment descriptor.
+ * @regs:		Register values as seen when entering kernel mode
+ * @seg_reg_idx:	Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the base address of the segment as indicated by the segment descriptor
+ * pointed to by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, base address of the segment. Zero in long mode,
+ * except when FS or GS are used. In virtual-8086 mode, the segment
+ * selector shifted 4 bits to the left.
+ *
+ * -1L in case of error.
+ */
+unsigned long insn_get_seg_base(struct pt_regs *regs, int seg_reg_idx)
+{
+	struct desc_struct *desc;
+	short sel;
+
+	sel = get_segment_selector(regs, seg_reg_idx);
+	if (sel < 0)
+		return -1L;
+
+	if (v8086_mode(regs))
+		/*
+		 * Base is simply the segment selector shifted 4
+		 * bits to the left.
+		 */
+		return (unsigned long)(sel << 4);
+
+	if (user_64bit_mode(regs)) {
+		/*
+		 * Only FS or GS will have a base address, the rest of
+		 * the segments' bases are forced to 0.
+		 */
+		unsigned long base;
+
+		if (seg_reg_idx == INAT_SEG_REG_FS)
+			rdmsrl(MSR_FS_BASE, base);
+		else if (seg_reg_idx == INAT_SEG_REG_GS)
+			/*
+			 * swapgs was called at the kernel entry point. Thus,
+			 * MSR_KERNEL_GS_BASE will have the user-space GS base.
+			 */
+			rdmsrl(MSR_KERNEL_GS_BASE, base);
+		else
+			base = 0;
+		return base;
+	}
+
+	/* In protected mode the segment selector cannot be null. */
+	if (!sel)
+		return -1L;
+
+	desc = get_desc(sel);
+	if (!desc)
+		return -1L;
+
+	return get_desc_base(desc);
+}
+
+/**
+ * get_seg_limit() - Obtain the limit of a segment descriptor
+ * @regs:		Register values as seen when entering kernel mode
+ * @seg_reg_idx:	Index of the segment register pointing to seg descriptor
+ *
+ * Obtain the limit of the segment as indicated by the segment descriptor
+ * pointed by the segment selector. The segment selector is obtained from the
+ * input segment register index @seg_reg_idx.
+ *
+ * Returns:
+ *
+ * In protected mode, the limit of the segment descriptor in bytes.
+ * In long mode and virtual-8086 mode, segment limits are not enforced. Thus,
+ * limit is returned as -1L to imply a limit-less segment.
+ *
+ * Zero is returned on error.
+ */
+static unsigned long get_seg_limit(struct pt_regs *regs, int seg_reg_idx)
+{
+	struct desc_struct *desc;
+	unsigned long limit;
+	short sel;
+
+	sel = get_segment_selector(regs, seg_reg_idx);
+	if (sel < 0)
+		return 0;
+
+	if (user_64bit_mode(regs) || v8086_mode(regs))
+		return -1L;
+
+	if (!sel)
+		return 0;
+
+	desc = get_desc(sel);
+	if (!desc)
+		return 0;
+
+	/*
+	 * If the granularity bit is set, the limit is given in multiples
+	 * of 4096. This also means that the 12 least significant bits are
+	 * not tested when checking the segment limits. In practice,
+	 * this means that the segment ends at (limit << 12) + 0xfff.
+	 */
+	limit = get_desc_limit(desc);
+	if (desc->g)
+		limit = (limit << 12) + 0xfff;
+
+	return limit;
+}
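
For illustration, a worked example of the granularity adjustment above (hypothetical descriptor values, not taken from this patch):

	/*
	 * A descriptor with a raw limit of 0xfffff and G = 1 covers
	 * (0xfffff << 12) + 0xfff = 0xffffffff, i.e. the full 4GB range,
	 * while the same raw limit with G = 0 ends the segment at 0xfffff.
	 */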
+
+/**
+ * insn_get_code_seg_params() - Obtain code segment parameters
+ * @regs:	Structure with register values as seen when entering kernel mode
+ *
+ * Obtain the default address and operand sizes of the code segment. These are
+ * obtained from the selector contained in the CS register in @regs. In
+ * protected mode, the defaults are determined by inspecting the L and D bits
+ * of the segment descriptor. In virtual-8086 mode, the default is always two
+ * bytes for both address and operand sizes.
+ *
+ * Returns:
+ *
+ * A signed 8-bit value containing the default parameters on success.
+ *
+ * -EINVAL on error.
+ */
+char insn_get_code_seg_params(struct pt_regs *regs)
+{
+	struct desc_struct *desc;
+	short sel;
+
+	if (v8086_mode(regs))
+		/* Address and operand size are both 16-bit. */
+		return INSN_CODE_SEG_PARAMS(2, 2);
+
+	sel = get_segment_selector(regs, INAT_SEG_REG_CS);
+	if (sel < 0)
+		return sel;
+
+	desc = get_desc(sel);
+	if (!desc)
+		return -EINVAL;
+
+	/*
+	 * The most significant bit of the Type field of the segment descriptor
+	 * determines whether a segment contains data or code. If this is a data
+	 * segment, return error.
+	 */
+	if (!(desc->type & BIT(3)))
+		return -EINVAL;
+
+	switch ((desc->l << 1) | desc->d) {
+	case 0: /*
+		 * Legacy mode. CS.L=0, CS.D=0. Address and operand size are
+		 * both 16-bit.
+		 */
+		return INSN_CODE_SEG_PARAMS(2, 2);
+	case 1: /*
+		 * Legacy mode. CS.L=0, CS.D=1. Address and operand size are
+		 * both 32-bit.
+		 */
+		return INSN_CODE_SEG_PARAMS(4, 4);
+	case 2: /*
+		 * IA-32e 64-bit mode. CS.L=1, CS.D=0. Address size is 64-bit;
+		 * operand size is 32-bit.
+		 */
+		return INSN_CODE_SEG_PARAMS(4, 8);
+	case 3: /* Invalid setting. CS.L=1, CS.D=1 */
+		/* fall through */
+	default:
+		return -EINVAL;
+	}
+}
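
As a hedged usage sketch, a caller would typically unpack the returned byte with the INSN_CODE_SEG_ADDR_SZ()/INSN_CODE_SEG_OPND_SZ() companion macros, assumed here to live in the matching insn-eval header (they are not shown in this hunk):

	/* Illustrative caller sketch; not part of this patch. */
	static bool example_apply_seg_defaults(struct insn *insn, struct pt_regs *regs)
	{
		char seg_defs = insn_get_code_seg_params(regs);

		if (seg_defs == -EINVAL)
			return false;

		/* Assumed packing: low nibble = operand size, high nibble = address size. */
		insn->opnd_bytes = INSN_CODE_SEG_OPND_SZ(seg_defs);
		insn->addr_bytes = INSN_CODE_SEG_ADDR_SZ(seg_defs);

		return true;
	}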
+
+/**
+ * insn_get_modrm_rm_off() - Obtain register in r/m part of the ModRM byte
+ * @insn:	Instruction containing the ModRM byte
+ * @regs:	Register values as seen when entering kernel mode
+ *
+ * Returns:
+ *
+ * The register indicated by the r/m part of the ModRM byte. The
+ * register is obtained as an offset from the base of pt_regs. In specific
+ * cases, the returned value can be -EDOM to indicate that the particular value
+ * of ModRM does not refer to a register and shall be ignored.
+ */
+int insn_get_modrm_rm_off(struct insn *insn, struct pt_regs *regs)
+{
+	return get_reg_offset(insn, regs, REG_TYPE_RM);
+}
+
+/**
+ * get_seg_base_limit() - obtain base address and limit of a segment
+ * @insn:	Instruction. Must be valid.
+ * @regs:	Register values as seen when entering kernel mode
+ * @regoff:	Operand offset, in pt_regs, used to resolve segment descriptor
+ * @base:	Obtained segment base
+ * @limit:	Obtained segment limit
+ *
+ * Obtain the base address and limit of the segment associated with the operand
+ * @regoff and, if present and allowed, any segment override prefixes in @insn.
+ * This function differs from insn_get_seg_base() in that the latter does not
+ * resolve the segment associated with the instruction operand. If a limit is
+ * not needed (e.g., when running in long mode), @limit can be NULL.
+ *
+ * Returns:
+ *
+ * 0 on success. @base and @limit will contain the base address and limit of the
+ * resolved segment, respectively.
+ *
+ * -EINVAL on error.
+ */
+static int get_seg_base_limit(struct insn *insn, struct pt_regs *regs,
+			      int regoff, unsigned long *base,
+			      unsigned long *limit)
+{
+	int seg_reg_idx;
+
+	if (!base)
+		return -EINVAL;
+
+	seg_reg_idx = resolve_seg_reg(insn, regs, regoff);
+	if (seg_reg_idx < 0)
+		return seg_reg_idx;
+
+	*base = insn_get_seg_base(regs, seg_reg_idx);
+	if (*base == -1L)
+		return -EINVAL;
+
+	if (!limit)
+		return 0;
+
+	*limit = get_seg_limit(regs, seg_reg_idx);
+	if (!(*limit))
+		return -EINVAL;
+
+	return 0;
+}
+
+/**
+ * get_eff_addr_reg() - Obtain effective address from register operand
+ * @insn:	Instruction. Must be valid.
+ * @regs:	Register values as seen when entering kernel mode
+ * @regoff:	Obtained operand offset, in pt_regs, with the effective address
+ * @eff_addr:	Obtained effective address
+ *
+ * Obtain the effective address stored in the register operand as indicated by
+ * the ModRM byte. This function is to be used only with register addressing
+ * (i.e.,  ModRM.mod is 3). The effective address is saved in @eff_addr. The
+ * register operand, as an offset from the base of pt_regs, is saved in @regoff;
+ * such offset can then be used to resolve the segment associated with the
+ * operand. This function can be used with any of the supported address sizes
+ * in x86.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the effective address stored in the
+ * operand indicated by ModRM. @regoff will have such operand as an offset from
+ * the base of pt_regs.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_reg(struct insn *insn, struct pt_regs *regs,
+			    int *regoff, long *eff_addr)
+{
+	insn_get_modrm(insn);
+
+	if (!insn->modrm.nbytes)
+		return -EINVAL;
+
+	if (X86_MODRM_MOD(insn->modrm.value) != 3)
+		return -EINVAL;
+
+	*regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
+	if (*regoff < 0)
+		return -EINVAL;
+
+	/* Ignore bytes that are outside the address size. */
+	if (insn->addr_bytes == 2)
+		*eff_addr = regs_get_register(regs, *regoff) & 0xffff;
+	else if (insn->addr_bytes == 4)
+		*eff_addr = regs_get_register(regs, *regoff) & 0xffffffff;
+	else /* 64-bit address */
+		*eff_addr = regs_get_register(regs, *regoff);
+
+	return 0;
+}
+
+/**
+ * get_eff_addr_modrm() - Obtain referenced effective address via ModRM
+ * @insn:	Instruction. Must be valid.
+ * @regs:	Register values as seen when entering kernel mode
+ * @regoff:	Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr:	Obtained effective address
+ *
+ * Obtain the effective address referenced by the ModRM byte of @insn. After
+ * identifying the registers involved in the register-indirect memory reference,
+ * its value is obtained from the operands in @regs. The computed address is
+ * stored in @eff_addr. Also, the register operand that indicates the associated
+ * segment is stored in @regoff; this parameter can later be used to determine
+ * such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address. @regoff
+ * will have a register, as an offset from the base of pt_regs, that can be used
+ * to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_modrm(struct insn *insn, struct pt_regs *regs,
+			      int *regoff, long *eff_addr)
+{
+	long tmp;
+
+	if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
+		return -EINVAL;
+
+	insn_get_modrm(insn);
+
+	if (!insn->modrm.nbytes)
+		return -EINVAL;
+
+	if (X86_MODRM_MOD(insn->modrm.value) > 2)
+		return -EINVAL;
+
+	*regoff = get_reg_offset(insn, regs, REG_TYPE_RM);
+
+	/*
+	 * -EDOM means that we must ignore the address_offset. In such a case,
+	 * in 64-bit mode the effective address is relative to the rIP of the
+	 * following instruction.
+	 */
+	if (*regoff == -EDOM) {
+		if (user_64bit_mode(regs))
+			tmp = regs->ip + insn->length;
+		else
+			tmp = 0;
+	} else if (*regoff < 0) {
+		return -EINVAL;
+	} else {
+		tmp = regs_get_register(regs, *regoff);
+	}
+
+	if (insn->addr_bytes == 4) {
+		int addr32 = (int)(tmp & 0xffffffff) + insn->displacement.value;
+
+		*eff_addr = addr32 & 0xffffffff;
+	} else {
+		*eff_addr = tmp + insn->displacement.value;
+	}
+
+	return 0;
+}
+
+/**
+ * get_eff_addr_modrm_16() - Obtain referenced effective address via ModRM
+ * @insn:	Instruction. Must be valid.
+ * @regs:	Register values as seen when entering kernel mode
+ * @regoff:	Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr:	Obtained effective address
+ *
+ * Obtain the 16-bit effective address referenced by the ModRM byte of @insn.
+ * After identifying the registers involved in the register-indirect memory
+ * reference, its value is obtained from the operands in @regs. The computed
+ * address is stored in @eff_addr. Also, the register operand that indicates
+ * the associated segment is stored in @regoff; this parameter can later be used
+ * to determine such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address. @regoff
+ * will have a register, as an offset from the base of pt_regs, that can be used
+ * to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_modrm_16(struct insn *insn, struct pt_regs *regs,
+				 int *regoff, short *eff_addr)
+{
+	int addr_offset1, addr_offset2, ret;
+	short addr1 = 0, addr2 = 0, displacement;
+
+	if (insn->addr_bytes != 2)
+		return -EINVAL;
+
+	insn_get_modrm(insn);
+
+	if (!insn->modrm.nbytes)
+		return -EINVAL;
+
+	if (X86_MODRM_MOD(insn->modrm.value) > 2)
+		return -EINVAL;
+
+	ret = get_reg_offset_16(insn, regs, &addr_offset1, &addr_offset2);
+	if (ret < 0)
+		return -EINVAL;
+
+	/*
+	 * Don't fail on invalid offset values. They might be invalid because
+	 * they cannot be used for this particular value of ModRM. Instead, use
+	 * them in the computation only if they contain a valid value.
+	 */
+	if (addr_offset1 != -EDOM)
+		addr1 = regs_get_register(regs, addr_offset1) & 0xffff;
+
+	if (addr_offset2 != -EDOM)
+		addr2 = regs_get_register(regs, addr_offset2) & 0xffff;
+
+	displacement = insn->displacement.value & 0xffff;
+	*eff_addr = addr1 + addr2 + displacement;
+
+	/*
+	 * The first operand register could indicate the use of either the SS or DS
+	 * registers to obtain the segment selector.  The second operand
+	 * register can only indicate the use of DS. Thus, the first operand
+	 * will be used to obtain the segment selector.
+	 */
+	*regoff = addr_offset1;
+
+	return 0;
+}
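
For illustration, a worked 16-bit example (hypothetical encoding, not taken from this patch):

	/*
	 * ModRM = 0x42 encodes mod = 01b, r/m = 010b, i.e. [BP + SI] with an
	 * 8-bit displacement. With a displacement of 0x10 the code above
	 * computes
	 *
	 *	eff_addr = (BP & 0xffff) + (SI & 0xffff) + 0x10;
	 *
	 * and returns the offset of BP in @regoff, which makes SS the default
	 * segment unless an override prefix is present.
	 */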
+
+/**
+ * get_eff_addr_sib() - Obtain referenced effective address via SIB
+ * @insn:	Instruction. Must be valid.
+ * @regs:	Register values as seen when entering kernel mode
+ * @base_offset:	Obtained operand offset, in pt_regs, associated with segment
+ * @eff_addr:	Obtained effective address
+ *
+ * Obtain the effective address referenced by the SIB byte of @insn. After
+ * identifying the registers involved in the indexed, register-indirect memory
+ * reference, its value is obtained from the operands in @regs. The computed
+ * address is stored in @eff_addr. Also, the register operand that indicates the
+ * associated segment is stored in @base_offset; this parameter can later be used
+ * determine such segment.
+ *
+ * Returns:
+ *
+ * 0 on success. @eff_addr will have the referenced effective address.
+ * @base_offset will have a register, as an offset from the base of pt_regs,
+ * that can be used to resolve the associated segment.
+ *
+ * -EINVAL on error.
+ */
+static int get_eff_addr_sib(struct insn *insn, struct pt_regs *regs,
+			    int *base_offset, long *eff_addr)
+{
+	long base, indx;
+	int indx_offset;
+
+	if (insn->addr_bytes != 8 && insn->addr_bytes != 4)
+		return -EINVAL;
+
+	insn_get_modrm(insn);
+
+	if (!insn->modrm.nbytes)
+		return -EINVAL;
+
+	if (X86_MODRM_MOD(insn->modrm.value) > 2)
+		return -EINVAL;
+
+	insn_get_sib(insn);
+
+	if (!insn->sib.nbytes)
+		return -EINVAL;
+
+	*base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
+	indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
+
+	/*
+	 * Negative values in the base and index offsets mean an error when
+	 * decoding the SIB byte, except for -EDOM, which means that the register
+	 * should not be used in the address computation.
+	 */
+	if (*base_offset == -EDOM)
+		base = 0;
+	else if (*base_offset < 0)
+		return -EINVAL;
+	else
+		base = regs_get_register(regs, *base_offset);
+
+	if (indx_offset == -EDOM)
+		indx = 0;
+	else if (indx_offset < 0)
+		return -EINVAL;
+	else
+		indx = regs_get_register(regs, indx_offset);
+
+	if (insn->addr_bytes == 4) {
+		int addr32, base32, idx32;
+
+		base32 = base & 0xffffffff;
+		idx32 = indx & 0xffffffff;
+
+		addr32 = base32 + idx32 * (1 << X86_SIB_SCALE(insn->sib.value));
+		addr32 += insn->displacement.value;
+
+		*eff_addr = addr32 & 0xffffffff;
+	} else {
+		*eff_addr = base + indx * (1 << X86_SIB_SCALE(insn->sib.value));
+		*eff_addr += insn->displacement.value;
+	}
+
+	return 0;
+}
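
For illustration, a worked SIB example (hypothetical encoding, not taken from this patch):

	/*
	 * SIB = 0x88 encodes scale = 10b, index = 001b (rCX) and base = 000b
	 * (rAX). For a 32-bit address size and a displacement of 0x10 the
	 * code above computes
	 *
	 *	eff_addr = ((rAX & 0xffffffff) + (rCX & 0xffffffff) * 4 + 0x10)
	 *		   & 0xffffffff;
	 */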
+
+/**
+ * get_addr_ref_16() - Obtain the 16-bit address referred by instruction
+ * @insn:	Instruction containing ModRM byte and displacement
+ * @regs:	Register values as seen when entering kernel mode
+ *
+ * This function is to be used with 16-bit address encodings. Obtain the memory
+ * address referred by the instruction's ModRM and displacement bytes. Also, the
+ * segment used as base is determined by either any segment override prefixes in
+ * @insn or the default segment of the registers involved in the address
+ * computation. In protected mode, segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by the instruction operands on success.
+ *
+ * -1L on error.
+ */
+static void __user *get_addr_ref_16(struct insn *insn, struct pt_regs *regs)
+{
+	unsigned long linear_addr = -1L, seg_base, seg_limit;
+	int ret, regoff;
+	short eff_addr;
+	long tmp;
+
+	insn_get_modrm(insn);
+	insn_get_displacement(insn);
+
+	if (insn->addr_bytes != 2)
+		goto out;
+
+	if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+		ret = get_eff_addr_reg(insn, regs, &regoff, &tmp);
+		if (ret)
+			goto out;
+
+		eff_addr = tmp;
+	} else {
+		ret = get_eff_addr_modrm_16(insn, regs, &regoff, &eff_addr);
+		if (ret)
+			goto out;
+	}
+
+	ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
+	if (ret)
+		goto out;
+
+	/*
+	 * Before computing the linear address, make sure the effective address
+	 * is within the limits of the segment. In virtual-8086 mode, segment
+	 * limits are not enforced. In such a case, the segment limit is -1L to
+	 * reflect this fact.
+	 */
+	if ((unsigned long)(eff_addr & 0xffff) > seg_limit)
+		goto out;
+
+	linear_addr = (unsigned long)(eff_addr & 0xffff) + seg_base;
+
+	/* Limit linear address to 20 bits */
+	if (v8086_mode(regs))
+		linear_addr &= 0xfffff;
+
+out:
+	return (void __user *)linear_addr;
+}
+
+/**
+ * get_addr_ref_32() - Obtain a 32-bit linear address
+ * @insn:	Instruction with ModRM, SIB bytes and displacement
+ * @regs:	Register values as seen when entering kernel mode
+ *
+ * This function is to be used with 32-bit address encodings to obtain the
+ * linear memory address referred by the instruction's ModRM, SIB,
+ * displacement bytes and segment base address, as applicable. If in protected
+ * mode, segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+static void __user *get_addr_ref_32(struct insn *insn, struct pt_regs *regs)
+{
+	unsigned long linear_addr = -1L, seg_base, seg_limit;
+	int eff_addr, regoff;
+	long tmp;
+	int ret;
+
+	if (insn->addr_bytes != 4)
+		goto out;
+
+	if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+		ret = get_eff_addr_reg(insn, regs, &regoff, &tmp);
+		if (ret)
+			goto out;
+
+		eff_addr = tmp;
+
+	} else {
+		if (insn->sib.nbytes) {
+			ret = get_eff_addr_sib(insn, regs, &regoff, &tmp);
+			if (ret)
+				goto out;
+
+			eff_addr = tmp;
+		} else {
+			ret = get_eff_addr_modrm(insn, regs, &regoff, &tmp);
+			if (ret)
+				goto out;
+
+			eff_addr = tmp;
+		}
+	}
+
+	ret = get_seg_base_limit(insn, regs, regoff, &seg_base, &seg_limit);
+	if (ret)
+		goto out;
+
+	/*
+	 * In protected mode, before computing the linear address, make sure
+	 * the effective address is within the limits of the segment.
+	 * 32-bit addresses can be used in long and virtual-8086 modes if an
+	 * address override prefix is used. In such cases, segment limits are
+	 * not enforced. When in virtual-8086 mode, the segment limit is -1L
+	 * to reflect this situation.
+	 *
+	 * After computed, the effective address is treated as an unsigned
+	 * quantity.
+	 */
+	if (!user_64bit_mode(regs) && ((unsigned int)eff_addr > seg_limit))
+		goto out;
+
+	/*
+	 * Even though 32-bit address encodings are allowed in virtual-8086
+	 * mode, the address range is still limited to [0x0000-0xffff].
+	 */
+	if (v8086_mode(regs) && (eff_addr & ~0xffff))
+		goto out;
+
+	/*
+	 * Data type long could be 64 bits in size. Ensure that our 32-bit
+	 * effective address is not sign-extended when computing the linear
+	 * address.
+	 */
+	linear_addr = (unsigned long)(eff_addr & 0xffffffff) + seg_base;
+
+	/* Limit linear address to 20 bits */
+	if (v8086_mode(regs))
+		linear_addr &= 0xfffff;
+
+out:
+	return (void __user *)linear_addr;
+}
+
+/**
+ * get_addr_ref_64() - Obtain a 64-bit linear address
+ * @insn:	Instruction struct with ModRM and SIB bytes and displacement
+ * @regs:	Structure with register values as seen when entering kernel mode
+ *
+ * This function is to be used with 64-bit address encodings to obtain the
+ * linear memory address referred by the instruction's ModRM, SIB,
+ * displacement bytes and segment base address, as applicable.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+#ifndef CONFIG_X86_64
+static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
+{
+	return (void __user *)-1L;
+}
+#else
+static void __user *get_addr_ref_64(struct insn *insn, struct pt_regs *regs)
+{
+	unsigned long linear_addr = -1L, seg_base;
+	int regoff, ret;
+	long eff_addr;
+
+	if (insn->addr_bytes != 8)
+		goto out;
+
+	if (X86_MODRM_MOD(insn->modrm.value) == 3) {
+		ret = get_eff_addr_reg(insn, regs, &regoff, &eff_addr);
+		if (ret)
+			goto out;
+
+	} else {
+		if (insn->sib.nbytes) {
+			ret = get_eff_addr_sib(insn, regs, &regoff, &eff_addr);
+			if (ret)
+				goto out;
+		} else {
+			ret = get_eff_addr_modrm(insn, regs, &regoff, &eff_addr);
+			if (ret)
+				goto out;
+		}
+
+	}
+
+	ret = get_seg_base_limit(insn, regs, regoff, &seg_base, NULL);
+	if (ret)
+		goto out;
+
+	linear_addr = (unsigned long)eff_addr + seg_base;
+
+out:
+	return (void __user *)linear_addr;
+}
+#endif /* CONFIG_X86_64 */
+
+/**
+ * insn_get_addr_ref() - Obtain the linear address referred by instruction
+ * @insn:	Instruction structure containing ModRM byte and displacement
+ * @regs:	Structure with register values as seen when entering kernel mode
+ *
+ * Obtain the linear address referred by the instruction's ModRM, SIB and
+ * displacement bytes, and segment base, as applicable. In protected mode,
+ * segment limits are enforced.
+ *
+ * Returns:
+ *
+ * Linear address referenced by instruction and registers on success.
+ *
+ * -1L on error.
+ */
+void __user *insn_get_addr_ref(struct insn *insn, struct pt_regs *regs)
+{
+	if (!insn || !regs)
+		return (void __user *)-1L;
+
+	switch (insn->addr_bytes) {
+	case 2:
+		return get_addr_ref_16(insn, regs);
+	case 4:
+		return get_addr_ref_32(insn, regs);
+	case 8:
+		return get_addr_ref_64(insn, regs);
+	default:
+		return (void __user *)-1L;
+	}
+}
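
To put the new entry point in context, a minimal, hedged sketch of how an exception handler might use it to resolve the memory operand of a faulting user-space instruction; insn_init() and insn_get_length() are the existing in-kernel instruction decoder API, and error handling is trimmed:

	/* Illustrative caller sketch; not part of this patch. */
	static void __user *example_resolve_operand(struct pt_regs *regs)
	{
		unsigned char buf[MAX_INSN_SIZE];
		struct insn insn;
		int nr_copied;

		/* copy_from_user() returns the number of bytes NOT copied. */
		nr_copied = sizeof(buf) -
			    copy_from_user(buf, (void __user *)regs->ip, sizeof(buf));
		if (nr_copied <= 0)
			return (void __user *)-1L;

		insn_init(&insn, buf, nr_copied, user_64bit_mode(regs));
		insn_get_length(&insn);

		/* Segment base, limit and effective address are resolved here. */
		return insn_get_addr_ref(&insn, regs);
	}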
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index b0ff378650a9..3109ba6c6ede 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -30,26 +30,6 @@
 #include <asm/trace/exceptions.h>
 
 /*
- * Page fault error code bits:
- *
- *   bit 0 ==	 0: no page found	1: protection fault
- *   bit 1 ==	 0: read access		1: write access
- *   bit 2 ==	 0: kernel-mode access	1: user-mode access
- *   bit 3 ==				1: use of reserved bit detected
- *   bit 4 ==				1: fault was an instruction fetch
- *   bit 5 ==				1: protection keys block access
- */
-enum x86_pf_error_code {
-
-	PF_PROT		=		1 << 0,
-	PF_WRITE	=		1 << 1,
-	PF_USER		=		1 << 2,
-	PF_RSVD		=		1 << 3,
-	PF_INSTR	=		1 << 4,
-	PF_PK		=		1 << 5,
-};
-
-/*
  * Returns 0 if mmiotrace is disabled, or if the fault is not
  * handled by mmiotrace:
  */
@@ -150,7 +130,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
 	 * If it was a exec (instruction fetch) fault on NX page, then
 	 * do not ignore the fault:
 	 */
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		return 0;
 
 	instr = (void *)convert_ip_to_linear(current, regs);
@@ -180,7 +160,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
  * siginfo so userspace can discover which protection key was set
  * on the PTE.
  *
- * If we get here, we know that the hardware signaled a PF_PK
+ * If we get here, we know that the hardware signaled a X86_PF_PK
  * fault and that there was a VMA once we got in the fault
  * handler.  It does *not* guarantee that the VMA we find here
  * was the one that we faulted on.
@@ -205,7 +185,7 @@ static void fill_sig_info_pkey(int si_code, siginfo_t *info, u32 *pkey)
 	/*
 	 * force_sig_info_fault() is called from a number of
 	 * contexts, some of which have a VMA and some of which
-	 * do not.  The PF_PK handing happens after we have a
+	 * do not.  The X86_PF_PK handling happens after we have a
 	 * valid VMA, so we should never reach this without a
 	 * valid VMA.
 	 */
@@ -698,7 +678,7 @@ show_fault_oops(struct pt_regs *regs, unsigned long error_code,
 	if (!oops_may_print())
 		return;
 
-	if (error_code & PF_INSTR) {
+	if (error_code & X86_PF_INSTR) {
 		unsigned int level;
 		pgd_t *pgd;
 		pte_t *pte;
@@ -780,7 +760,7 @@ no_context(struct pt_regs *regs, unsigned long error_code,
 		 */
 		if (current->thread.sig_on_uaccess_err && signal) {
 			tsk->thread.trap_nr = X86_TRAP_PF;
-			tsk->thread.error_code = error_code | PF_USER;
+			tsk->thread.error_code = error_code | X86_PF_USER;
 			tsk->thread.cr2 = address;
 
 			/* XXX: hwpoison faults will set the wrong code. */
@@ -898,7 +878,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 	struct task_struct *tsk = current;
 
 	/* User mode accesses just cause a SIGSEGV */
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		 * It's possible to have interrupts off here:
 		 */
@@ -919,7 +899,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * Instruction fetch faults in the vsyscall page might need
 		 * emulation.
 		 */
-		if (unlikely((error_code & PF_INSTR) &&
+		if (unlikely((error_code & X86_PF_INSTR) &&
 			     ((address & ~0xfff) == VSYSCALL_ADDR))) {
 			if (emulate_vsyscall(regs, address))
 				return;
@@ -932,7 +912,7 @@ __bad_area_nosemaphore(struct pt_regs *regs, unsigned long error_code,
 		 * are always protection faults.
 		 */
 		if (address >= TASK_SIZE_MAX)
-			error_code |= PF_PROT;
+			error_code |= X86_PF_PROT;
 
 		if (likely(show_unhandled_signals))
 			show_signal_msg(regs, error_code, address, tsk);
@@ -993,11 +973,11 @@ static inline bool bad_area_access_from_pkeys(unsigned long error_code,
 
 	if (!boot_cpu_has(X86_FEATURE_OSPKE))
 		return false;
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return true;
 	/* this checks permission keys on the VMA: */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return true;
 	return false;
 }
@@ -1025,7 +1005,7 @@ do_sigbus(struct pt_regs *regs, unsigned long error_code, unsigned long address,
 	int code = BUS_ADRERR;
 
 	/* Kernel mode? Handle exceptions or die: */
-	if (!(error_code & PF_USER)) {
+	if (!(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, SIGBUS, BUS_ADRERR);
 		return;
 	}
@@ -1053,14 +1033,14 @@ static noinline void
 mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 	       unsigned long address, u32 *pkey, unsigned int fault)
 {
-	if (fatal_signal_pending(current) && !(error_code & PF_USER)) {
+	if (fatal_signal_pending(current) && !(error_code & X86_PF_USER)) {
 		no_context(regs, error_code, address, 0, 0);
 		return;
 	}
 
 	if (fault & VM_FAULT_OOM) {
 		/* Kernel mode? Handle exceptions or die: */
-		if (!(error_code & PF_USER)) {
+		if (!(error_code & X86_PF_USER)) {
 			no_context(regs, error_code, address,
 				   SIGSEGV, SEGV_MAPERR);
 			return;
@@ -1085,16 +1065,16 @@ mm_fault_error(struct pt_regs *regs, unsigned long error_code,
 
 static int spurious_fault_check(unsigned long error_code, pte_t *pte)
 {
-	if ((error_code & PF_WRITE) && !pte_write(*pte))
+	if ((error_code & X86_PF_WRITE) && !pte_write(*pte))
 		return 0;
 
-	if ((error_code & PF_INSTR) && !pte_exec(*pte))
+	if ((error_code & X86_PF_INSTR) && !pte_exec(*pte))
 		return 0;
 	/*
 	 * Note: We do not do lazy flushing on protection key
-	 * changes, so no spurious fault will ever set PF_PK.
+	 * changes, so no spurious fault will ever set X86_PF_PK.
 	 */
-	if ((error_code & PF_PK))
+	if ((error_code & X86_PF_PK))
 		return 1;
 
 	return 1;
@@ -1140,8 +1120,8 @@ spurious_fault(unsigned long error_code, unsigned long address)
 	 * change, so user accesses are not expected to cause spurious
 	 * faults.
 	 */
-	if (error_code != (PF_WRITE | PF_PROT)
-	    && error_code != (PF_INSTR | PF_PROT))
+	if (error_code != (X86_PF_WRITE | X86_PF_PROT) &&
+	    error_code != (X86_PF_INSTR | X86_PF_PROT))
 		return 0;
 
 	pgd = init_mm.pgd + pgd_index(address);
@@ -1201,19 +1181,19 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	 * always an unconditional error and can never result in
 	 * a follow-up action to resolve the fault, like a COW.
 	 */
-	if (error_code & PF_PK)
+	if (error_code & X86_PF_PK)
 		return 1;
 
 	/*
 	 * Make sure to check the VMA so that we do not perform
-	 * faults just to hit a PF_PK as soon as we fill in a
+	 * faults just to hit a X86_PF_PK as soon as we fill in a
 	 * page.
 	 */
-	if (!arch_vma_access_permitted(vma, (error_code & PF_WRITE),
-				(error_code & PF_INSTR), foreign))
+	if (!arch_vma_access_permitted(vma, (error_code & X86_PF_WRITE),
+				       (error_code & X86_PF_INSTR), foreign))
 		return 1;
 
-	if (error_code & PF_WRITE) {
+	if (error_code & X86_PF_WRITE) {
 		/* write, present and write, not present: */
 		if (unlikely(!(vma->vm_flags & VM_WRITE)))
 			return 1;
@@ -1221,7 +1201,7 @@ access_error(unsigned long error_code, struct vm_area_struct *vma)
 	}
 
 	/* read, present: */
-	if (unlikely(error_code & PF_PROT))
+	if (unlikely(error_code & X86_PF_PROT))
 		return 1;
 
 	/* read, not present: */
@@ -1244,7 +1224,7 @@ static inline bool smap_violation(int error_code, struct pt_regs *regs)
 	if (!static_cpu_has(X86_FEATURE_SMAP))
 		return false;
 
-	if (error_code & PF_USER)
+	if (error_code & X86_PF_USER)
 		return false;
 
 	if (!user_mode(regs) && (regs->flags & X86_EFLAGS_AC))
@@ -1297,7 +1277,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * protection error (error_code & 9) == 0.
 	 */
 	if (unlikely(fault_in_kernel_space(address))) {
-		if (!(error_code & (PF_RSVD | PF_USER | PF_PROT))) {
+		if (!(error_code & (X86_PF_RSVD | X86_PF_USER | X86_PF_PROT))) {
 			if (vmalloc_fault(address) >= 0)
 				return;
 
@@ -1325,7 +1305,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	if (unlikely(kprobes_fault(regs)))
 		return;
 
-	if (unlikely(error_code & PF_RSVD))
+	if (unlikely(error_code & X86_PF_RSVD))
 		pgtable_bad(regs, error_code, address);
 
 	if (unlikely(smap_violation(error_code, regs))) {
@@ -1351,7 +1331,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 */
 	if (user_mode(regs)) {
 		local_irq_enable();
-		error_code |= PF_USER;
+		error_code |= X86_PF_USER;
 		flags |= FAULT_FLAG_USER;
 	} else {
 		if (regs->flags & X86_EFLAGS_IF)
@@ -1360,9 +1340,9 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 
 	perf_sw_event(PERF_COUNT_SW_PAGE_FAULTS, 1, regs, address);
 
-	if (error_code & PF_WRITE)
+	if (error_code & X86_PF_WRITE)
 		flags |= FAULT_FLAG_WRITE;
-	if (error_code & PF_INSTR)
+	if (error_code & X86_PF_INSTR)
 		flags |= FAULT_FLAG_INSTRUCTION;
 
 	/*
@@ -1382,7 +1362,7 @@ __do_page_fault(struct pt_regs *regs, unsigned long error_code,
 	 * space check, thus avoiding the deadlock:
 	 */
 	if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
-		if ((error_code & PF_USER) == 0 &&
+		if (!(error_code & X86_PF_USER) &&
 		    !search_exception_tables(regs->ip)) {
 			bad_area_nosemaphore(regs, error_code, address, NULL);
 			return;
@@ -1409,7 +1389,7 @@ retry:
 		bad_area(regs, error_code, address);
 		return;
 	}
-	if (error_code & PF_USER) {
+	if (error_code & X86_PF_USER) {
 		/*
 		 * Accessing the stack below %sp is always a bug.
 		 * The large cushion allows instructions like enter
diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c
index 048fbe8fc274..adcea90a2046 100644
--- a/arch/x86/mm/init_64.c
+++ b/arch/x86/mm/init_64.c
@@ -1426,16 +1426,16 @@ int __meminit vmemmap_populate(unsigned long start, unsigned long end, int node)
 
 #if defined(CONFIG_MEMORY_HOTPLUG_SPARSE) && defined(CONFIG_HAVE_BOOTMEM_INFO_NODE)
 void register_page_bootmem_memmap(unsigned long section_nr,
-				  struct page *start_page, unsigned long size)
+				  struct page *start_page, unsigned long nr_pages)
 {
 	unsigned long addr = (unsigned long)start_page;
-	unsigned long end = (unsigned long)(start_page + size);
+	unsigned long end = (unsigned long)(start_page + nr_pages);
 	unsigned long next;
 	pgd_t *pgd;
 	p4d_t *p4d;
 	pud_t *pud;
 	pmd_t *pmd;
-	unsigned int nr_pages;
+	unsigned int nr_pmd_pages;
 	struct page *page;
 
 	for (; addr < end; addr = next) {
@@ -1482,9 +1482,9 @@ void register_page_bootmem_memmap(unsigned long section_nr,
 			if (pmd_none(*pmd))
 				continue;
 
-			nr_pages = 1 << (get_order(PMD_SIZE));
+			nr_pmd_pages = 1 << get_order(PMD_SIZE);
 			page = pmd_page(*pmd);
-			while (nr_pages--)
+			while (nr_pmd_pages--)
 				get_page_bootmem(section_nr, page++,
 						 SECTION_INFO);
 		}
diff --git a/arch/x86/mm/ioremap.c b/arch/x86/mm/ioremap.c
index 34f0e1847dd6..6e4573b1da34 100644
--- a/arch/x86/mm/ioremap.c
+++ b/arch/x86/mm/ioremap.c
@@ -27,6 +27,11 @@
 
 #include "physaddr.h"
 
+struct ioremap_mem_flags {
+	bool system_ram;
+	bool desc_other;
+};
+
 /*
  * Fix up the linear direct mapping of the kernel to avoid cache attribute
  * conflicts.
@@ -56,17 +61,59 @@ int ioremap_change_attr(unsigned long vaddr, unsigned long size,
 	return err;
 }
 
-static int __ioremap_check_ram(unsigned long start_pfn, unsigned long nr_pages,
-			       void *arg)
+static bool __ioremap_check_ram(struct resource *res)
 {
+	unsigned long start_pfn, stop_pfn;
 	unsigned long i;
 
-	for (i = 0; i < nr_pages; ++i)
-		if (pfn_valid(start_pfn + i) &&
-		    !PageReserved(pfn_to_page(start_pfn + i)))
-			return 1;
+	if ((res->flags & IORESOURCE_SYSTEM_RAM) != IORESOURCE_SYSTEM_RAM)
+		return false;
 
-	return 0;
+	start_pfn = (res->start + PAGE_SIZE - 1) >> PAGE_SHIFT;
+	stop_pfn = (res->end + 1) >> PAGE_SHIFT;
+	if (stop_pfn > start_pfn) {
+		for (i = 0; i < (stop_pfn - start_pfn); ++i)
+			if (pfn_valid(start_pfn + i) &&
+			    !PageReserved(pfn_to_page(start_pfn + i)))
+				return true;
+	}
+
+	return false;
+}
+
+static int __ioremap_check_desc_other(struct resource *res)
+{
+	return (res->desc != IORES_DESC_NONE);
+}
+
+static int __ioremap_res_check(struct resource *res, void *arg)
+{
+	struct ioremap_mem_flags *flags = arg;
+
+	if (!flags->system_ram)
+		flags->system_ram = __ioremap_check_ram(res);
+
+	if (!flags->desc_other)
+		flags->desc_other = __ioremap_check_desc_other(res);
+
+	return flags->system_ram && flags->desc_other;
+}
+
+/*
+ * To avoid multiple resource walks, this function walks resources marked as
+ * IORESOURCE_MEM and IORESOURCE_BUSY, looking for system RAM and/or a
+ * resource described as other than IORES_DESC_NONE (e.g. IORES_DESC_ACPI_TABLES).
+ */
+static void __ioremap_check_mem(resource_size_t addr, unsigned long size,
+				struct ioremap_mem_flags *flags)
+{
+	u64 start, end;
+
+	start = (u64)addr;
+	end = start + size - 1;
+	memset(flags, 0, sizeof(*flags));
+
+	walk_mem_res(start, end, flags, __ioremap_res_check);
 }
 
 /*
@@ -87,9 +134,10 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		unsigned long size, enum page_cache_mode pcm, void *caller)
 {
 	unsigned long offset, vaddr;
-	resource_size_t pfn, last_pfn, last_addr;
+	resource_size_t last_addr;
 	const resource_size_t unaligned_phys_addr = phys_addr;
 	const unsigned long unaligned_size = size;
+	struct ioremap_mem_flags mem_flags;
 	struct vm_struct *area;
 	enum page_cache_mode new_pcm;
 	pgprot_t prot;
@@ -108,13 +156,12 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		return NULL;
 	}
 
+	__ioremap_check_mem(phys_addr, size, &mem_flags);
+
 	/*
 	 * Don't allow anybody to remap normal RAM that we're using..
 	 */
-	pfn      = phys_addr >> PAGE_SHIFT;
-	last_pfn = last_addr >> PAGE_SHIFT;
-	if (walk_system_ram_range(pfn, last_pfn - pfn + 1, NULL,
-					  __ioremap_check_ram) == 1) {
+	if (mem_flags.system_ram) {
 		WARN_ONCE(1, "ioremap on RAM at %pa - %pa\n",
 			  &phys_addr, &last_addr);
 		return NULL;
@@ -146,7 +193,15 @@ static void __iomem *__ioremap_caller(resource_size_t phys_addr,
 		pcm = new_pcm;
 	}
 
+	/*
+	 * If the page being mapped is in memory and SEV is active then
+	 * make sure the memory encryption attribute is enabled in the
+	 * resulting mapping.
+	 */
 	prot = PAGE_KERNEL_IO;
+	if (sev_active() && mem_flags.desc_other)
+		prot = pgprot_encrypted(prot);
+
 	switch (pcm) {
 	case _PAGE_CACHE_MODE_UC:
 	default:
@@ -422,6 +477,9 @@ void unxlate_dev_mem_ptr(phys_addr_t phys, void *addr)
  * areas should be mapped decrypted. And since the encryption key can
  * change across reboots, persistent memory should also be mapped
  * decrypted.
+ *
+ * If SEV is active, that implies that BIOS/UEFI also ran encrypted so
+ * only persistent memory should be mapped decrypted.
  */
 static bool memremap_should_map_decrypted(resource_size_t phys_addr,
 					  unsigned long size)
@@ -458,6 +516,11 @@ static bool memremap_should_map_decrypted(resource_size_t phys_addr,
 	case E820_TYPE_ACPI:
 	case E820_TYPE_NVS:
 	case E820_TYPE_UNUSABLE:
+		/* For SEV, these areas are encrypted */
+		if (sev_active())
+			break;
+		/* Fallthrough */
+
 	case E820_TYPE_PRAM:
 		return true;
 	default:
@@ -581,7 +644,7 @@ static bool __init early_memremap_is_setup_data(resource_size_t phys_addr,
 bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
 				 unsigned long flags)
 {
-	if (!sme_active())
+	if (!mem_encrypt_active())
 		return true;
 
 	if (flags & MEMREMAP_ENC)
@@ -590,12 +653,13 @@ bool arch_memremap_can_ram_remap(resource_size_t phys_addr, unsigned long size,
 	if (flags & MEMREMAP_DEC)
 		return false;
 
-	if (memremap_is_setup_data(phys_addr, size) ||
-	    memremap_is_efi_data(phys_addr, size) ||
-	    memremap_should_map_decrypted(phys_addr, size))
-		return false;
+	if (sme_active()) {
+		if (memremap_is_setup_data(phys_addr, size) ||
+		    memremap_is_efi_data(phys_addr, size))
+			return false;
+	}
 
-	return true;
+	return !memremap_should_map_decrypted(phys_addr, size);
 }
 
 /*
@@ -608,17 +672,24 @@ pgprot_t __init early_memremap_pgprot_adjust(resource_size_t phys_addr,
 					     unsigned long size,
 					     pgprot_t prot)
 {
-	if (!sme_active())
+	bool encrypted_prot;
+
+	if (!mem_encrypt_active())
 		return prot;
 
-	if (early_memremap_is_setup_data(phys_addr, size) ||
-	    memremap_is_efi_data(phys_addr, size) ||
-	    memremap_should_map_decrypted(phys_addr, size))
-		prot = pgprot_decrypted(prot);
-	else
-		prot = pgprot_encrypted(prot);
+	encrypted_prot = true;
+
+	if (sme_active()) {
+		if (early_memremap_is_setup_data(phys_addr, size) ||
+		    memremap_is_efi_data(phys_addr, size))
+			encrypted_prot = false;
+	}
+
+	if (encrypted_prot && memremap_should_map_decrypted(phys_addr, size))
+		encrypted_prot = false;
 
-	return prot;
+	return encrypted_prot ? pgprot_encrypted(prot)
+			      : pgprot_decrypted(prot);
 }
 
 bool phys_mem_access_encrypted(unsigned long phys_addr, unsigned long size)
diff --git a/arch/x86/mm/kasan_init_64.c b/arch/x86/mm/kasan_init_64.c
index 8f5be3eb40dd..2b60dc6e64b1 100644
--- a/arch/x86/mm/kasan_init_64.c
+++ b/arch/x86/mm/kasan_init_64.c
@@ -16,6 +16,8 @@
 
 extern struct range pfn_mapped[E820_MAX_ENTRIES];
 
+static p4d_t tmp_p4d_table[PTRS_PER_P4D] __initdata __aligned(PAGE_SIZE);
+
 static int __init map_range(struct range *range)
 {
 	unsigned long start;
@@ -31,8 +33,10 @@ static void __init clear_pgds(unsigned long start,
 			unsigned long end)
 {
 	pgd_t *pgd;
+	/* See comment in kasan_init() */
+	unsigned long pgd_end = end & PGDIR_MASK;
 
-	for (; start < end; start += PGDIR_SIZE) {
+	for (; start < pgd_end; start += PGDIR_SIZE) {
 		pgd = pgd_offset_k(start);
 		/*
 		 * With folded p4d, pgd_clear() is nop, use p4d_clear()
@@ -43,29 +47,61 @@ static void __init clear_pgds(unsigned long start,
 		else
 			pgd_clear(pgd);
 	}
+
+	pgd = pgd_offset_k(start);
+	for (; start < end; start += P4D_SIZE)
+		p4d_clear(p4d_offset(pgd, start));
+}
+
+static inline p4d_t *early_p4d_offset(pgd_t *pgd, unsigned long addr)
+{
+	unsigned long p4d;
+
+	if (!IS_ENABLED(CONFIG_X86_5LEVEL))
+		return (p4d_t *)pgd;
+
+	p4d = __pa_nodebug(pgd_val(*pgd)) & PTE_PFN_MASK;
+	p4d += __START_KERNEL_map - phys_base;
+	return (p4d_t *)p4d + p4d_index(addr);
+}
+
+static void __init kasan_early_p4d_populate(pgd_t *pgd,
+		unsigned long addr,
+		unsigned long end)
+{
+	pgd_t pgd_entry;
+	p4d_t *p4d, p4d_entry;
+	unsigned long next;
+
+	if (pgd_none(*pgd)) {
+		pgd_entry = __pgd(_KERNPG_TABLE | __pa_nodebug(kasan_zero_p4d));
+		set_pgd(pgd, pgd_entry);
+	}
+
+	p4d = early_p4d_offset(pgd, addr);
+	do {
+		next = p4d_addr_end(addr, end);
+
+		if (!p4d_none(*p4d))
+			continue;
+
+		p4d_entry = __p4d(_KERNPG_TABLE | __pa_nodebug(kasan_zero_pud));
+		set_p4d(p4d, p4d_entry);
+	} while (p4d++, addr = next, addr != end && p4d_none(*p4d));
 }
 
 static void __init kasan_map_early_shadow(pgd_t *pgd)
 {
-	int i;
-	unsigned long start = KASAN_SHADOW_START;
+	/* See comment in kasan_init() */
+	unsigned long addr = KASAN_SHADOW_START & PGDIR_MASK;
 	unsigned long end = KASAN_SHADOW_END;
+	unsigned long next;
 
-	for (i = pgd_index(start); start < end; i++) {
-		switch (CONFIG_PGTABLE_LEVELS) {
-		case 4:
-			pgd[i] = __pgd(__pa_nodebug(kasan_zero_pud) |
-					_KERNPG_TABLE);
-			break;
-		case 5:
-			pgd[i] = __pgd(__pa_nodebug(kasan_zero_p4d) |
-					_KERNPG_TABLE);
-			break;
-		default:
-			BUILD_BUG();
-		}
-		start += PGDIR_SIZE;
-	}
+	pgd += pgd_index(addr);
+	do {
+		next = pgd_addr_end(addr, end);
+		kasan_early_p4d_populate(pgd, addr, next);
+	} while (pgd++, addr = next, addr != end);
 }
 
 #ifdef CONFIG_KASAN_INLINE
@@ -102,7 +138,7 @@ void __init kasan_early_init(void)
 	for (i = 0; i < PTRS_PER_PUD; i++)
 		kasan_zero_pud[i] = __pud(pud_val);
 
-	for (i = 0; CONFIG_PGTABLE_LEVELS >= 5 && i < PTRS_PER_P4D; i++)
+	for (i = 0; IS_ENABLED(CONFIG_X86_5LEVEL) && i < PTRS_PER_P4D; i++)
 		kasan_zero_p4d[i] = __p4d(p4d_val);
 
 	kasan_map_early_shadow(early_top_pgt);
@@ -118,12 +154,35 @@ void __init kasan_init(void)
 #endif
 
 	memcpy(early_top_pgt, init_top_pgt, sizeof(early_top_pgt));
+
+	/*
+	 * We use the same shadow offset for 4- and 5-level paging to
+	 * facilitate boot-time switching between paging modes.
+	 * As a result, in 5-level paging mode KASAN_SHADOW_START and
+	 * KASAN_SHADOW_END are not aligned to a PGD boundary.
+	 *
+	 * KASAN_SHADOW_START doesn't share PGD with anything else.
+	 * We claim whole PGD entry to make things easier.
+	 *
+	 * KASAN_SHADOW_END lands in the last PGD entry and it collides with a
+	 * bunch of things like kernel code, modules, the EFI mapping, etc.
+	 * We need to take extra steps to not overwrite them.
+	 */
+	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
+		void *ptr;
+
+		ptr = (void *)pgd_page_vaddr(*pgd_offset_k(KASAN_SHADOW_END));
+		memcpy(tmp_p4d_table, (void *)ptr, sizeof(tmp_p4d_table));
+		set_pgd(&early_top_pgt[pgd_index(KASAN_SHADOW_END)],
+				__pgd(__pa(tmp_p4d_table) | _KERNPG_TABLE));
+	}
+
 	load_cr3(early_top_pgt);
 	__flush_tlb_all();
 
-	clear_pgds(KASAN_SHADOW_START, KASAN_SHADOW_END);
+	clear_pgds(KASAN_SHADOW_START & PGDIR_MASK, KASAN_SHADOW_END);
 
-	kasan_populate_zero_shadow((void *)KASAN_SHADOW_START,
+	kasan_populate_zero_shadow((void *)(KASAN_SHADOW_START & PGDIR_MASK),
 			kasan_mem_to_shadow((void *)PAGE_OFFSET));
 
 	for (i = 0; i < E820_MAX_ENTRIES; i++) {
diff --git a/arch/x86/mm/mem_encrypt.c b/arch/x86/mm/mem_encrypt.c
index 0286327e65fa..d9a9e9fc75dd 100644
--- a/arch/x86/mm/mem_encrypt.c
+++ b/arch/x86/mm/mem_encrypt.c
@@ -30,6 +30,8 @@
 #include <asm/msr.h>
 #include <asm/cmdline.h>
 
+#include "mm_internal.h"
+
 static char sme_cmdline_arg[] __initdata = "mem_encrypt";
 static char sme_cmdline_on[]  __initdata = "on";
 static char sme_cmdline_off[] __initdata = "off";
@@ -41,6 +43,10 @@ static char sme_cmdline_off[] __initdata = "off";
  */
 u64 sme_me_mask __section(.data) = 0;
 EXPORT_SYMBOL(sme_me_mask);
+DEFINE_STATIC_KEY_FALSE(sev_enable_key);
+EXPORT_SYMBOL_GPL(sev_enable_key);
+
+static bool sev_enabled __section(.data);
 
 /* Buffer used for early in-place encryption by BSP, no locking needed */
 static char sme_early_buffer[PAGE_SIZE] __aligned(PAGE_SIZE);
@@ -63,7 +69,6 @@ static void __init __sme_early_enc_dec(resource_size_t paddr,
 	if (!sme_me_mask)
 		return;
 
-	local_flush_tlb();
 	wbinvd();
 
 	/*
@@ -190,8 +195,238 @@ void __init sme_early_init(void)
 	/* Update the protection map with memory encryption mask */
 	for (i = 0; i < ARRAY_SIZE(protection_map); i++)
 		protection_map[i] = pgprot_encrypted(protection_map[i]);
+
+	if (sev_active())
+		swiotlb_force = SWIOTLB_FORCE;
+}
+
+static void *sev_alloc(struct device *dev, size_t size, dma_addr_t *dma_handle,
+		       gfp_t gfp, unsigned long attrs)
+{
+	unsigned long dma_mask;
+	unsigned int order;
+	struct page *page;
+	void *vaddr = NULL;
+
+	dma_mask = dma_alloc_coherent_mask(dev, gfp);
+	order = get_order(size);
+
+	/*
+	 * Memory will be memset to zero after marking decrypted, so don't
+	 * bother clearing it before.
+	 */
+	gfp &= ~__GFP_ZERO;
+
+	page = alloc_pages_node(dev_to_node(dev), gfp, order);
+	if (page) {
+		dma_addr_t addr;
+
+		/*
+		 * Since we will be clearing the encryption bit, check the
+		 * mask with it already cleared.
+		 */
+		addr = __sme_clr(phys_to_dma(dev, page_to_phys(page)));
+		if ((addr + size) > dma_mask) {
+			__free_pages(page, get_order(size));
+		} else {
+			vaddr = page_address(page);
+			*dma_handle = addr;
+		}
+	}
+
+	if (!vaddr)
+		vaddr = swiotlb_alloc_coherent(dev, size, dma_handle, gfp);
+
+	if (!vaddr)
+		return NULL;
+
+	/* Clear the SME encryption bit for DMA use if not swiotlb area */
+	if (!is_swiotlb_buffer(dma_to_phys(dev, *dma_handle))) {
+		set_memory_decrypted((unsigned long)vaddr, 1 << order);
+		memset(vaddr, 0, PAGE_SIZE << order);
+		*dma_handle = __sme_clr(*dma_handle);
+	}
+
+	return vaddr;
 }
 
+static void sev_free(struct device *dev, size_t size, void *vaddr,
+		     dma_addr_t dma_handle, unsigned long attrs)
+{
+	/* Set the SME encryption bit for re-use if not swiotlb area */
+	if (!is_swiotlb_buffer(dma_to_phys(dev, dma_handle)))
+		set_memory_encrypted((unsigned long)vaddr,
+				     1 << get_order(size));
+
+	swiotlb_free_coherent(dev, size, vaddr, dma_handle);
+}
+
+static void __init __set_clr_pte_enc(pte_t *kpte, int level, bool enc)
+{
+	pgprot_t old_prot, new_prot;
+	unsigned long pfn, pa, size;
+	pte_t new_pte;
+
+	switch (level) {
+	case PG_LEVEL_4K:
+		pfn = pte_pfn(*kpte);
+		old_prot = pte_pgprot(*kpte);
+		break;
+	case PG_LEVEL_2M:
+		pfn = pmd_pfn(*(pmd_t *)kpte);
+		old_prot = pmd_pgprot(*(pmd_t *)kpte);
+		break;
+	case PG_LEVEL_1G:
+		pfn = pud_pfn(*(pud_t *)kpte);
+		old_prot = pud_pgprot(*(pud_t *)kpte);
+		break;
+	default:
+		return;
+	}
+
+	new_prot = old_prot;
+	if (enc)
+		pgprot_val(new_prot) |= _PAGE_ENC;
+	else
+		pgprot_val(new_prot) &= ~_PAGE_ENC;
+
+	/* If the new and old protections are the same, there is nothing to do. */
+	if (pgprot_val(old_prot) == pgprot_val(new_prot))
+		return;
+
+	pa = pfn << page_level_shift(level);
+	size = page_level_size(level);
+
+	/*
+	 * We are going to perform in-place en-/decryption and change the
+	 * physical page attribute from C=1 to C=0 or vice versa. Flush the
+	 * caches to ensure that data gets accessed with the correct C-bit.
+	 */
+	clflush_cache_range(__va(pa), size);
+
+	/* Encrypt/decrypt the contents in-place */
+	if (enc)
+		sme_early_encrypt(pa, size);
+	else
+		sme_early_decrypt(pa, size);
+
+	/* Change the page encryption mask. */
+	new_pte = pfn_pte(pfn, new_prot);
+	set_pte_atomic(kpte, new_pte);
+}
+
+static int __init early_set_memory_enc_dec(unsigned long vaddr,
+					   unsigned long size, bool enc)
+{
+	unsigned long vaddr_end, vaddr_next;
+	unsigned long psize, pmask;
+	int split_page_size_mask;
+	int level, ret;
+	pte_t *kpte;
+
+	vaddr_next = vaddr;
+	vaddr_end = vaddr + size;
+
+	for (; vaddr < vaddr_end; vaddr = vaddr_next) {
+		kpte = lookup_address(vaddr, &level);
+		if (!kpte || pte_none(*kpte)) {
+			ret = 1;
+			goto out;
+		}
+
+		if (level == PG_LEVEL_4K) {
+			__set_clr_pte_enc(kpte, level, enc);
+			vaddr_next = (vaddr & PAGE_MASK) + PAGE_SIZE;
+			continue;
+		}
+
+		psize = page_level_size(level);
+		pmask = page_level_mask(level);
+
+		/*
+		 * Check whether we can change the large page in one go.
+		 * We request a split when the address is not aligned or the
+		 * number of pages to set/clear the encryption bit is smaller
+		 * than the number of pages in the large page.
+		 */
+		if (vaddr == (vaddr & pmask) &&
+		    ((vaddr_end - vaddr) >= psize)) {
+			__set_clr_pte_enc(kpte, level, enc);
+			vaddr_next = (vaddr & pmask) + psize;
+			continue;
+		}
+
+		/*
+		 * The virtual address is part of a larger page; create the next-
+		 * level page table mapping (4K or 2M). If it is part of a 2M
+		 * page, request a split of the large page into 4K chunks; a
+		 * 1GB large page is split into 2M pages.
+		 */
+		if (level == PG_LEVEL_2M)
+			split_page_size_mask = 0;
+		else
+			split_page_size_mask = 1 << PG_LEVEL_2M;
+
+		kernel_physical_mapping_init(__pa(vaddr & pmask),
+					     __pa((vaddr_end & pmask) + psize),
+					     split_page_size_mask);
+	}
+
+	ret = 0;
+
+out:
+	__flush_tlb_all();
+	return ret;
+}
+
+int __init early_set_memory_decrypted(unsigned long vaddr, unsigned long size)
+{
+	return early_set_memory_enc_dec(vaddr, size, false);
+}
+
+int __init early_set_memory_encrypted(unsigned long vaddr, unsigned long size)
+{
+	return early_set_memory_enc_dec(vaddr, size, true);
+}
+
+/*
+ * SME and SEV are very similar but they are not the same, so there are
+ * times that the kernel will need to distinguish between SME and SEV. The
+ * sme_active() and sev_active() functions are used for this.  When a
+ * distinction isn't needed, the mem_encrypt_active() function can be used.
+ *
+ * The trampoline code is a good example of this requirement.  Before
+ * paging is activated, SME will access all memory as decrypted, but SEV
+ * will access all memory as encrypted.  So, when APs are being brought
+ * up under SME the trampoline area cannot be encrypted, whereas under SEV
+ * the trampoline area must be encrypted.
+ */
+bool sme_active(void)
+{
+	return sme_me_mask && !sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sme_active);
+
+bool sev_active(void)
+{
+	return sme_me_mask && sev_enabled;
+}
+EXPORT_SYMBOL_GPL(sev_active);
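
As a hedged illustration of how these helpers are meant to be combined (mem_encrypt_active() is assumed to be the existing header helper that simply tests sme_me_mask; this sketch is not part of the patch):

	static void example_mem_enc_policy(void)
	{
		if (!mem_encrypt_active())
			return;			/* neither SME nor SEV */

		if (sme_active())		/* host-side memory encryption */
			pr_info("SME: leave the trampoline area decrypted\n");
		else if (sev_active())		/* running as an SEV guest */
			pr_info("SEV: use decrypted DMA bounce buffers\n");
	}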
+
+static const struct dma_map_ops sev_dma_ops = {
+	.alloc                  = sev_alloc,
+	.free                   = sev_free,
+	.map_page               = swiotlb_map_page,
+	.unmap_page             = swiotlb_unmap_page,
+	.map_sg                 = swiotlb_map_sg_attrs,
+	.unmap_sg               = swiotlb_unmap_sg_attrs,
+	.sync_single_for_cpu    = swiotlb_sync_single_for_cpu,
+	.sync_single_for_device = swiotlb_sync_single_for_device,
+	.sync_sg_for_cpu        = swiotlb_sync_sg_for_cpu,
+	.sync_sg_for_device     = swiotlb_sync_sg_for_device,
+	.mapping_error          = swiotlb_dma_mapping_error,
+};
+
 /* Architecture __weak replacement functions */
 void __init mem_encrypt_init(void)
 {
@@ -201,7 +436,23 @@ void __init mem_encrypt_init(void)
 	/* Call into SWIOTLB to update the SWIOTLB DMA buffers */
 	swiotlb_update_mem_attributes();
 
-	pr_info("AMD Secure Memory Encryption (SME) active\n");
+	/*
+	 * With SEV, DMA operations cannot use encryption. New DMA ops
+	 * are required in order to mark the DMA areas as decrypted or
+	 * to use bounce buffers.
+	 */
+	if (sev_active())
+		dma_ops = &sev_dma_ops;
+
+	/*
+	 * With SEV, we need to unroll the rep string I/O instructions.
+	 */
+	if (sev_active())
+		static_branch_enable(&sev_enable_key);
+
+	pr_info("AMD %s active\n",
+		sev_active() ? "Secure Encrypted Virtualization (SEV)"
+			     : "Secure Memory Encryption (SME)");
 }
 
 void swiotlb_set_mem_attributes(void *vaddr, unsigned long size)
@@ -529,37 +780,63 @@ void __init __nostackprotector sme_enable(struct boot_params *bp)
 {
 	const char *cmdline_ptr, *cmdline_arg, *cmdline_on, *cmdline_off;
 	unsigned int eax, ebx, ecx, edx;
+	unsigned long feature_mask;
 	bool active_by_default;
 	unsigned long me_mask;
 	char buffer[16];
 	u64 msr;
 
-	/* Check for the SME support leaf */
+	/* Check for the SME/SEV support leaf */
 	eax = 0x80000000;
 	ecx = 0;
 	native_cpuid(&eax, &ebx, &ecx, &edx);
 	if (eax < 0x8000001f)
 		return;
 
+#define AMD_SME_BIT	BIT(0)
+#define AMD_SEV_BIT	BIT(1)
 	/*
-	 * Check for the SME feature:
-	 *   CPUID Fn8000_001F[EAX] - Bit 0
-	 *     Secure Memory Encryption support
-	 *   CPUID Fn8000_001F[EBX] - Bits 5:0
-	 *     Pagetable bit position used to indicate encryption
+	 * Set the feature mask (SME or SEV) based on whether we are
+	 * running under a hypervisor.
+	 */
+	eax = 1;
+	ecx = 0;
+	native_cpuid(&eax, &ebx, &ecx, &edx);
+	feature_mask = (ecx & BIT(31)) ? AMD_SEV_BIT : AMD_SME_BIT;
+
+	/*
+	 * Check for the SME/SEV feature:
+	 *   CPUID Fn8000_001F[EAX]
+	 *   - Bit 0 - Secure Memory Encryption support
+	 *   - Bit 1 - Secure Encrypted Virtualization support
+	 *   CPUID Fn8000_001F[EBX]
+	 *   - Bits 5:0 - Pagetable bit position used to indicate encryption
 	 */
 	eax = 0x8000001f;
 	ecx = 0;
 	native_cpuid(&eax, &ebx, &ecx, &edx);
-	if (!(eax & 1))
+	if (!(eax & feature_mask))
 		return;
 
 	me_mask = 1UL << (ebx & 0x3f);
 
-	/* Check if SME is enabled */
-	msr = __rdmsr(MSR_K8_SYSCFG);
-	if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+	/* Check if memory encryption is enabled */
+	if (feature_mask == AMD_SME_BIT) {
+		/* For SME, check the SYSCFG MSR */
+		msr = __rdmsr(MSR_K8_SYSCFG);
+		if (!(msr & MSR_K8_SYSCFG_MEM_ENCRYPT))
+			return;
+	} else {
+		/* For SEV, check the SEV MSR */
+		msr = __rdmsr(MSR_AMD64_SEV);
+		if (!(msr & MSR_AMD64_SEV_ENABLED))
+			return;
+
+		/* SEV state cannot be controlled by a command line option */
+		sme_me_mask = me_mask;
+		sev_enabled = true;
 		return;
+	}
 
 	/*
 	 * Fixups have not been applied to phys_base yet and we're running
diff --git a/arch/x86/mm/mpx.c b/arch/x86/mm/mpx.c
index 7eb06701a935..e500949bae24 100644
--- a/arch/x86/mm/mpx.c
+++ b/arch/x86/mm/mpx.c
@@ -13,6 +13,7 @@
 #include <linux/sched/sysctl.h>
 
 #include <asm/insn.h>
+#include <asm/insn-eval.h>
 #include <asm/mman.h>
 #include <asm/mmu_context.h>
 #include <asm/mpx.h>
@@ -61,123 +62,6 @@ static unsigned long mpx_mmap(unsigned long len)
 	return addr;
 }
 
-enum reg_type {
-	REG_TYPE_RM = 0,
-	REG_TYPE_INDEX,
-	REG_TYPE_BASE,
-};
-
-static int get_reg_offset(struct insn *insn, struct pt_regs *regs,
-			  enum reg_type type)
-{
-	int regno = 0;
-
-	static const int regoff[] = {
-		offsetof(struct pt_regs, ax),
-		offsetof(struct pt_regs, cx),
-		offsetof(struct pt_regs, dx),
-		offsetof(struct pt_regs, bx),
-		offsetof(struct pt_regs, sp),
-		offsetof(struct pt_regs, bp),
-		offsetof(struct pt_regs, si),
-		offsetof(struct pt_regs, di),
-#ifdef CONFIG_X86_64
-		offsetof(struct pt_regs, r8),
-		offsetof(struct pt_regs, r9),
-		offsetof(struct pt_regs, r10),
-		offsetof(struct pt_regs, r11),
-		offsetof(struct pt_regs, r12),
-		offsetof(struct pt_regs, r13),
-		offsetof(struct pt_regs, r14),
-		offsetof(struct pt_regs, r15),
-#endif
-	};
-	int nr_registers = ARRAY_SIZE(regoff);
-	/*
-	 * Don't possibly decode a 32-bit instructions as
-	 * reading a 64-bit-only register.
-	 */
-	if (IS_ENABLED(CONFIG_X86_64) && !insn->x86_64)
-		nr_registers -= 8;
-
-	switch (type) {
-	case REG_TYPE_RM:
-		regno = X86_MODRM_RM(insn->modrm.value);
-		if (X86_REX_B(insn->rex_prefix.value))
-			regno += 8;
-		break;
-
-	case REG_TYPE_INDEX:
-		regno = X86_SIB_INDEX(insn->sib.value);
-		if (X86_REX_X(insn->rex_prefix.value))
-			regno += 8;
-		break;
-
-	case REG_TYPE_BASE:
-		regno = X86_SIB_BASE(insn->sib.value);
-		if (X86_REX_B(insn->rex_prefix.value))
-			regno += 8;
-		break;
-
-	default:
-		pr_err("invalid register type");
-		BUG();
-		break;
-	}
-
-	if (regno >= nr_registers) {
-		WARN_ONCE(1, "decoded an instruction with an invalid register");
-		return -EINVAL;
-	}
-	return regoff[regno];
-}
-
-/*
- * return the address being referenced be instruction
- * for rm=3 returning the content of the rm reg
- * for rm!=3 calculates the address using SIB and Disp
- */
-static void __user *mpx_get_addr_ref(struct insn *insn, struct pt_regs *regs)
-{
-	unsigned long addr, base, indx;
-	int addr_offset, base_offset, indx_offset;
-	insn_byte_t sib;
-
-	insn_get_modrm(insn);
-	insn_get_sib(insn);
-	sib = insn->sib.value;
-
-	if (X86_MODRM_MOD(insn->modrm.value) == 3) {
-		addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
-		if (addr_offset < 0)
-			goto out_err;
-		addr = regs_get_register(regs, addr_offset);
-	} else {
-		if (insn->sib.nbytes) {
-			base_offset = get_reg_offset(insn, regs, REG_TYPE_BASE);
-			if (base_offset < 0)
-				goto out_err;
-
-			indx_offset = get_reg_offset(insn, regs, REG_TYPE_INDEX);
-			if (indx_offset < 0)
-				goto out_err;
-
-			base = regs_get_register(regs, base_offset);
-			indx = regs_get_register(regs, indx_offset);
-			addr = base + indx * (1 << X86_SIB_SCALE(sib));
-		} else {
-			addr_offset = get_reg_offset(insn, regs, REG_TYPE_RM);
-			if (addr_offset < 0)
-				goto out_err;
-			addr = regs_get_register(regs, addr_offset);
-		}
-		addr += insn->displacement.value;
-	}
-	return (void __user *)addr;
-out_err:
-	return (void __user *)-1;
-}
-
 static int mpx_insn_decode(struct insn *insn,
 			   struct pt_regs *regs)
 {
@@ -290,7 +174,7 @@ siginfo_t *mpx_generate_siginfo(struct pt_regs *regs)
 	info->si_signo = SIGSEGV;
 	info->si_errno = 0;
 	info->si_code = SEGV_BNDERR;
-	info->si_addr = mpx_get_addr_ref(&insn, regs);
+	info->si_addr = insn_get_addr_ref(&insn, regs);
 	/*
 	 * We were not able to extract an address from the instruction,
 	 * probably because there was something invalid in it.
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dfb7d657cf43..3fe68483463c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1781,8 +1781,8 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 	unsigned long start;
 	int ret;
 
-	/* Nothing to do if the SME is not active */
-	if (!sme_active())
+	/* Nothing to do if memory encryption is not active */
+	if (!mem_encrypt_active())
 		return 0;
 
 	/* Should not be working on unaligned addresses */
diff --git a/arch/x86/platform/efi/efi_64.c b/arch/x86/platform/efi/efi_64.c
index 20fb31579b69..9e4ee5b04b2d 100644
--- a/arch/x86/platform/efi/efi_64.c
+++ b/arch/x86/platform/efi/efi_64.c
@@ -33,6 +33,7 @@
 #include <linux/reboot.h>
 #include <linux/slab.h>
 #include <linux/ucs2_string.h>
+#include <linux/mem_encrypt.h>
 
 #include <asm/setup.h>
 #include <asm/page.h>
@@ -370,7 +371,11 @@ int __init efi_setup_page_tables(unsigned long pa_memmap, unsigned num_pages)
 	 * as trim_bios_range() will reserve the first page and isolate it away
 	 * from memory allocators anyway.
 	 */
-	if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, _PAGE_RW)) {
+	pf = _PAGE_RW;
+	if (sev_active())
+		pf |= _PAGE_ENC;
+
+	if (kernel_map_pages_in_pgd(pgd, 0x0, 0x0, 1, pf)) {
 		pr_err("Failed to create 1:1 mapping for the first page!\n");
 		return 1;
 	}
@@ -413,6 +418,9 @@ static void __init __map_region(efi_memory_desc_t *md, u64 va)
 	if (!(md->attribute & EFI_MEMORY_WB))
 		flags |= _PAGE_PCD;
 
+	if (sev_active())
+		flags |= _PAGE_ENC;
+
 	pfn = md->phys_addr >> PAGE_SHIFT;
 	if (kernel_map_pages_in_pgd(pgd, pfn, va, md->num_pages, flags))
 		pr_warn("Error mapping PA 0x%llx -> VA 0x%llx!\n",
@@ -539,6 +547,9 @@ static int __init efi_update_mem_attr(struct mm_struct *mm, efi_memory_desc_t *m
 	if (!(md->attribute & EFI_MEMORY_RO))
 		pf |= _PAGE_RW;
 
+	if (sev_active())
+		pf |= _PAGE_ENC;
+
 	return efi_update_mappings(md, pf);
 }
 
@@ -590,6 +601,9 @@ void __init efi_runtime_update_mappings(void)
 			(md->type != EFI_RUNTIME_SERVICES_CODE))
 			pf |= _PAGE_RW;
 
+		if (sev_active())
+			pf |= _PAGE_ENC;
+
 		efi_update_mappings(md, pf);
 	}
 }
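
[Editorial sketch] All four efi_64.c hunks follow one pattern: when sev_active(), OR _PAGE_ENC into the page-table flags so the EFI regions are mapped encrypted like the rest of the SEV guest's memory. A toy stand-alone version of that flag assembly (the bit values are placeholders, not the real _PAGE_* definitions):

#include <stdbool.h>
#include <stdio.h>

/* Placeholder flag bits for illustration only. */
#define PAGE_RW		(1ULL << 1)
#define PAGE_PCD	(1ULL << 4)
#define PAGE_ENC	(1ULL << 51)	/* stands in for the C-bit PTE flag */

static bool sev_guest = true;		/* pretend sev_active() returned true */

static unsigned long long efi_map_flags(bool write_back, bool read_only)
{
	unsigned long long pf = 0;

	if (!read_only)
		pf |= PAGE_RW;
	if (!write_back)
		pf |= PAGE_PCD;
	if (sev_guest)			/* the step each hunk above adds */
		pf |= PAGE_ENC;
	return pf;
}

int main(void)
{
	printf("flags = %#llx\n", efi_map_flags(true, false));
	return 0;
}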
diff --git a/arch/x86/realmode/init.c b/arch/x86/realmode/init.c
index ed84d3917a59..d10105825d57 100644
--- a/arch/x86/realmode/init.c
+++ b/arch/x86/realmode/init.c
@@ -64,9 +64,10 @@ static void __init setup_real_mode(void)
 	/*
 	 * If SME is active, the trampoline area will need to be in
 	 * decrypted memory in order to bring up other processors
-	 * successfully.
+	 * successfully. This is not needed for SEV.
 	 */
-	set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
+	if (sme_active())
+		set_memory_decrypted((unsigned long)base, size >> PAGE_SHIFT);
 
 	memcpy(base, real_mode_blob, size);
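
[Editorial sketch] The updated comment carries the whole change: under SME the trampoline must sit in decrypted memory so the other processors can be brought up, while a SEV guest stays encrypted and needs nothing here, so the call is now gated on sme_active(). A minimal stand-alone sketch of that gate (helpers, addresses and sizes are stand-ins):

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SHIFT 12

static bool sme_host = true;		/* pretend sme_active() returned true */

static void set_memory_decrypted(unsigned long base, int npages)
{
	printf("marking %d page(s) at %#lx decrypted\n", npages, base);
}

static void setup_real_mode_sketch(unsigned long base, unsigned long size)
{
	/* SME host: decrypt the trampoline.  SEV guest: do nothing. */
	if (sme_host)
		set_memory_decrypted(base, size >> PAGE_SHIFT);
}

int main(void)
{
	setup_real_mode_sketch(0x98000, 0x1000);	/* made-up values */
	return 0;
}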
 
diff --git a/arch/x86/um/ldt.c b/arch/x86/um/ldt.c
index 836a1eb5df43..3ee234b6234d 100644
--- a/arch/x86/um/ldt.c
+++ b/arch/x86/um/ldt.c
@@ -6,6 +6,7 @@
 #include <linux/mm.h>
 #include <linux/sched.h>
 #include <linux/slab.h>
+#include <linux/syscalls.h>
 #include <linux/uaccess.h>
 #include <asm/unistd.h>
 #include <os.h>
@@ -369,7 +370,9 @@ void free_ldt(struct mm_context *mm)
 	mm->arch.ldt.entry_count = 0;
 }
 
-int sys_modify_ldt(int func, void __user *ptr, unsigned long bytecount)
+SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr ,
+		unsigned long , bytecount)
 {
-	return do_modify_ldt_skas(func, ptr, bytecount);
+	/* See non-um modify_ldt() for why we do this cast */
+	return (unsigned int)do_modify_ldt_skas(func, ptr, bytecount);
 }
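
[Editorial sketch] The UML entry point now goes through SYSCALL_DEFINE3 like the native one, and the result is cast to unsigned int before being returned as the long the macro declares. Mechanically, that zero-extends the 32-bit value into the 64-bit return register instead of sign-extending it; the referenced comment in the native modify_ldt() covers the ABI reasoning. A quick user-space illustration of the difference (values are arbitrary; run on a 64-bit build):

#include <stdio.h>

/* What the wrapper returns with and without the '(unsigned int)' cast. */
static long return_sign_extended(int ret) { return ret; }
static long return_zero_extended(int ret) { return (unsigned int)ret; }

int main(void)
{
	int err = -22;	/* e.g. -EINVAL from the low-level helper */
	long s = return_sign_extended(err);
	long z = return_zero_extended(err);

	printf("sign-extended: %ld (%#lx)\n", s, (unsigned long)s);
	printf("zero-extended: %ld (%#lx)\n", z, (unsigned long)z);
	return 0;
}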
diff --git a/arch/x86/xen/enlighten_pv.c b/arch/x86/xen/enlighten_pv.c
index d4396e27b1fb..e55d276afc70 100644
--- a/arch/x86/xen/enlighten_pv.c
+++ b/arch/x86/xen/enlighten_pv.c
@@ -601,7 +601,7 @@ static struct trap_array_entry trap_array[] = {
 #ifdef CONFIG_X86_MCE
 	{ machine_check,               xen_machine_check,               true },
 #endif
-	{ nmi,                         xen_nmi,                         true },
+	{ nmi,                         xen_xennmi,                      true },
 	{ overflow,                    xen_overflow,                    false },
 #ifdef CONFIG_IA32_EMULATION
 	{ entry_INT80_compat,          xen_entry_INT80_compat,          false },
@@ -811,15 +811,14 @@ static void __init xen_write_gdt_entry_boot(struct desc_struct *dt, int entry,
 	}
 }
 
-static void xen_load_sp0(struct tss_struct *tss,
-			 struct thread_struct *thread)
+static void xen_load_sp0(unsigned long sp0)
 {
 	struct multicall_space mcs;
 
 	mcs = xen_mc_entry(0);
-	MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
+	MULTI_stack_switch(mcs.mc, __KERNEL_DS, sp0);
 	xen_mc_issue(PARAVIRT_LAZY_CPU);
-	tss->x86_tss.sp0 = thread->sp0;
+	this_cpu_write(cpu_tss.x86_tss.sp0, sp0);
 }
 
 void xen_set_iopl_mask(unsigned mask)
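
[Editorial sketch] xen_load_sp0() now receives the new stack pointer directly and mirrors it into the per-CPU TSS via this_cpu_write(), instead of dereferencing a caller-supplied thread_struct and tss_struct; presumably the paravirt load_sp0() hook was reworked to this signature elsewhere in the series. A stub sketch of the hook's shape after the change, with the multicall and per-CPU machinery replaced by placeholders:

#include <stdio.h>

static unsigned long fake_cpu_tss_sp0;	/* stands in for cpu_tss.x86_tss.sp0 */

static void sketch_load_sp0(unsigned long sp0)
{
	/* 1. Tell the hypervisor about the new kernel stack ... */
	printf("stack_switch(__KERNEL_DS, %#lx)\n", sp0);
	/* 2. ... and keep the per-CPU TSS copy in sync. */
	fake_cpu_tss_sp0 = sp0;
}

int main(void)
{
	sketch_load_sp0(0xffffc90000004000UL);	/* made-up stack address */
	printf("per-CPU sp0 is now %#lx\n", fake_cpu_tss_sp0);
	return 0;
}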
diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c
index 71495f1a86d7..2ccdaba31a07 100644
--- a/arch/x86/xen/mmu_pv.c
+++ b/arch/x86/xen/mmu_pv.c
@@ -449,7 +449,7 @@ __visible pmd_t xen_make_pmd(pmdval_t pmd)
 }
 PV_CALLEE_SAVE_REGS_THUNK(xen_make_pmd);
 
-#if CONFIG_PGTABLE_LEVELS == 4
+#ifdef CONFIG_X86_64
 __visible pudval_t xen_pud_val(pud_t pud)
 {
 	return pte_mfn_to_pfn(pud.pud);
@@ -538,7 +538,7 @@ static void xen_set_p4d(p4d_t *ptr, p4d_t val)
 
 	xen_mc_issue(PARAVIRT_LAZY_MMU);
 }
-#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
+#endif	/* CONFIG_X86_64 */
 
 static int xen_pmd_walk(struct mm_struct *mm, pmd_t *pmd,
 		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
@@ -580,21 +580,17 @@ static int xen_p4d_walk(struct mm_struct *mm, p4d_t *p4d,
 		int (*func)(struct mm_struct *mm, struct page *, enum pt_level),
 		bool last, unsigned long limit)
 {
-	int i, nr, flush = 0;
+	int flush = 0;
+	pud_t *pud;
 
-	nr = last ? p4d_index(limit) + 1 : PTRS_PER_P4D;
-	for (i = 0; i < nr; i++) {
-		pud_t *pud;
 
-		if (p4d_none(p4d[i]))
-			continue;
+	if (p4d_none(*p4d))
+		return flush;
 
-		pud = pud_offset(&p4d[i], 0);
-		if (PTRS_PER_PUD > 1)
-			flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
-		flush |= xen_pud_walk(mm, pud, func,
-				last && i == nr - 1, limit);
-	}
+	pud = pud_offset(p4d, 0);
+	if (PTRS_PER_PUD > 1)
+		flush |= (*func)(mm, virt_to_page(pud), PT_PUD);
+	flush |= xen_pud_walk(mm, pud, func, last, limit);
 	return flush;
 }
 
@@ -644,8 +640,6 @@ static int __xen_pgd_walk(struct mm_struct *mm, pgd_t *pgd,
 			continue;
 
 		p4d = p4d_offset(&pgd[i], 0);
-		if (PTRS_PER_P4D > 1)
-			flush |= (*func)(mm, virt_to_page(p4d), PT_P4D);
 		flush |= xen_p4d_walk(mm, p4d, func, i == nr - 1, limit);
 	}
 
@@ -1176,22 +1170,14 @@ static void __init xen_cleanmfnmap(unsigned long vaddr)
 {
 	pgd_t *pgd;
 	p4d_t *p4d;
-	unsigned int i;
 	bool unpin;
 
 	unpin = (vaddr == 2 * PGDIR_SIZE);
 	vaddr &= PMD_MASK;
 	pgd = pgd_offset_k(vaddr);
 	p4d = p4d_offset(pgd, 0);
-	for (i = 0; i < PTRS_PER_P4D; i++) {
-		if (p4d_none(p4d[i]))
-			continue;
-		xen_cleanmfnmap_p4d(p4d + i, unpin);
-	}
-	if (IS_ENABLED(CONFIG_X86_5LEVEL)) {
-		set_pgd(pgd, __pgd(0));
-		xen_cleanmfnmap_free_pgtbl(p4d, unpin);
-	}
+	if (!p4d_none(*p4d))
+		xen_cleanmfnmap_p4d(p4d, unpin);
 }
 
 static void __init xen_pagetable_p2m_free(void)
@@ -1692,7 +1678,7 @@ static void xen_release_pmd(unsigned long pfn)
 	xen_release_ptpage(pfn, PT_PMD);
 }
 
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
 static void xen_alloc_pud(struct mm_struct *mm, unsigned long pfn)
 {
 	xen_alloc_ptpage(mm, pfn, PT_PUD);
@@ -2029,13 +2015,12 @@ static phys_addr_t __init xen_early_virt_to_phys(unsigned long vaddr)
  */
 void __init xen_relocate_p2m(void)
 {
-	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys, p4d_phys;
+	phys_addr_t size, new_area, pt_phys, pmd_phys, pud_phys;
 	unsigned long p2m_pfn, p2m_pfn_end, n_frames, pfn, pfn_end;
-	int n_pte, n_pt, n_pmd, n_pud, n_p4d, idx_pte, idx_pt, idx_pmd, idx_pud, idx_p4d;
+	int n_pte, n_pt, n_pmd, n_pud, idx_pte, idx_pt, idx_pmd, idx_pud;
 	pte_t *pt;
 	pmd_t *pmd;
 	pud_t *pud;
-	p4d_t *p4d = NULL;
 	pgd_t *pgd;
 	unsigned long *new_p2m;
 	int save_pud;
@@ -2045,11 +2030,7 @@ void __init xen_relocate_p2m(void)
 	n_pt = roundup(size, PMD_SIZE) >> PMD_SHIFT;
 	n_pmd = roundup(size, PUD_SIZE) >> PUD_SHIFT;
 	n_pud = roundup(size, P4D_SIZE) >> P4D_SHIFT;
-	if (PTRS_PER_P4D > 1)
-		n_p4d = roundup(size, PGDIR_SIZE) >> PGDIR_SHIFT;
-	else
-		n_p4d = 0;
-	n_frames = n_pte + n_pt + n_pmd + n_pud + n_p4d;
+	n_frames = n_pte + n_pt + n_pmd + n_pud;
 
 	new_area = xen_find_free_area(PFN_PHYS(n_frames));
 	if (!new_area) {
@@ -2065,76 +2046,56 @@ void __init xen_relocate_p2m(void)
 	 * To avoid any possible virtual address collision, just use
 	 * 2 * PUD_SIZE for the new area.
 	 */
-	p4d_phys = new_area;
-	pud_phys = p4d_phys + PFN_PHYS(n_p4d);
+	pud_phys = new_area;
 	pmd_phys = pud_phys + PFN_PHYS(n_pud);
 	pt_phys = pmd_phys + PFN_PHYS(n_pmd);
 	p2m_pfn = PFN_DOWN(pt_phys) + n_pt;
 
 	pgd = __va(read_cr3_pa());
 	new_p2m = (unsigned long *)(2 * PGDIR_SIZE);
-	idx_p4d = 0;
 	save_pud = n_pud;
-	do {
-		if (n_p4d > 0) {
-			p4d = early_memremap(p4d_phys, PAGE_SIZE);
-			clear_page(p4d);
-			n_pud = min(save_pud, PTRS_PER_P4D);
-		}
-		for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
-			pud = early_memremap(pud_phys, PAGE_SIZE);
-			clear_page(pud);
-			for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
-				 idx_pmd++) {
-				pmd = early_memremap(pmd_phys, PAGE_SIZE);
-				clear_page(pmd);
-				for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
-					 idx_pt++) {
-					pt = early_memremap(pt_phys, PAGE_SIZE);
-					clear_page(pt);
-					for (idx_pte = 0;
-						 idx_pte < min(n_pte, PTRS_PER_PTE);
-						 idx_pte++) {
-						set_pte(pt + idx_pte,
-								pfn_pte(p2m_pfn, PAGE_KERNEL));
-						p2m_pfn++;
-					}
-					n_pte -= PTRS_PER_PTE;
-					early_memunmap(pt, PAGE_SIZE);
-					make_lowmem_page_readonly(__va(pt_phys));
-					pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
-							PFN_DOWN(pt_phys));
-					set_pmd(pmd + idx_pt,
-							__pmd(_PAGE_TABLE | pt_phys));
-					pt_phys += PAGE_SIZE;
+	for (idx_pud = 0; idx_pud < n_pud; idx_pud++) {
+		pud = early_memremap(pud_phys, PAGE_SIZE);
+		clear_page(pud);
+		for (idx_pmd = 0; idx_pmd < min(n_pmd, PTRS_PER_PUD);
+				idx_pmd++) {
+			pmd = early_memremap(pmd_phys, PAGE_SIZE);
+			clear_page(pmd);
+			for (idx_pt = 0; idx_pt < min(n_pt, PTRS_PER_PMD);
+					idx_pt++) {
+				pt = early_memremap(pt_phys, PAGE_SIZE);
+				clear_page(pt);
+				for (idx_pte = 0;
+						idx_pte < min(n_pte, PTRS_PER_PTE);
+						idx_pte++) {
+					set_pte(pt + idx_pte,
+							pfn_pte(p2m_pfn, PAGE_KERNEL));
+					p2m_pfn++;
 				}
-				n_pt -= PTRS_PER_PMD;
-				early_memunmap(pmd, PAGE_SIZE);
-				make_lowmem_page_readonly(__va(pmd_phys));
-				pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
-						PFN_DOWN(pmd_phys));
-				set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
-				pmd_phys += PAGE_SIZE;
+				n_pte -= PTRS_PER_PTE;
+				early_memunmap(pt, PAGE_SIZE);
+				make_lowmem_page_readonly(__va(pt_phys));
+				pin_pagetable_pfn(MMUEXT_PIN_L1_TABLE,
+						PFN_DOWN(pt_phys));
+				set_pmd(pmd + idx_pt,
+						__pmd(_PAGE_TABLE | pt_phys));
+				pt_phys += PAGE_SIZE;
 			}
-			n_pmd -= PTRS_PER_PUD;
-			early_memunmap(pud, PAGE_SIZE);
-			make_lowmem_page_readonly(__va(pud_phys));
-			pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
-			if (n_p4d > 0)
-				set_p4d(p4d + idx_pud, __p4d(_PAGE_TABLE | pud_phys));
-			else
-				set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
-			pud_phys += PAGE_SIZE;
-		}
-		if (n_p4d > 0) {
-			save_pud -= PTRS_PER_P4D;
-			early_memunmap(p4d, PAGE_SIZE);
-			make_lowmem_page_readonly(__va(p4d_phys));
-			pin_pagetable_pfn(MMUEXT_PIN_L4_TABLE, PFN_DOWN(p4d_phys));
-			set_pgd(pgd + 2 + idx_p4d, __pgd(_PAGE_TABLE | p4d_phys));
-			p4d_phys += PAGE_SIZE;
+			n_pt -= PTRS_PER_PMD;
+			early_memunmap(pmd, PAGE_SIZE);
+			make_lowmem_page_readonly(__va(pmd_phys));
+			pin_pagetable_pfn(MMUEXT_PIN_L2_TABLE,
+					PFN_DOWN(pmd_phys));
+			set_pud(pud + idx_pmd, __pud(_PAGE_TABLE | pmd_phys));
+			pmd_phys += PAGE_SIZE;
 		}
-	} while (++idx_p4d < n_p4d);
+		n_pmd -= PTRS_PER_PUD;
+		early_memunmap(pud, PAGE_SIZE);
+		make_lowmem_page_readonly(__va(pud_phys));
+		pin_pagetable_pfn(MMUEXT_PIN_L3_TABLE, PFN_DOWN(pud_phys));
+		set_pgd(pgd + 2 + idx_pud, __pgd(_PAGE_TABLE | pud_phys));
+		pud_phys += PAGE_SIZE;
+	}
 
 	/* Now copy the old p2m info to the new area. */
 	memcpy(new_p2m, xen_p2m_addr, size);
@@ -2361,7 +2322,7 @@ static void __init xen_post_allocator_init(void)
 	pv_mmu_ops.set_pte = xen_set_pte;
 	pv_mmu_ops.set_pmd = xen_set_pmd;
 	pv_mmu_ops.set_pud = xen_set_pud;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
 	pv_mmu_ops.set_p4d = xen_set_p4d;
 #endif
 
@@ -2371,7 +2332,7 @@ static void __init xen_post_allocator_init(void)
 	pv_mmu_ops.alloc_pmd = xen_alloc_pmd;
 	pv_mmu_ops.release_pte = xen_release_pte;
 	pv_mmu_ops.release_pmd = xen_release_pmd;
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
 	pv_mmu_ops.alloc_pud = xen_alloc_pud;
 	pv_mmu_ops.release_pud = xen_release_pud;
 #endif
@@ -2435,14 +2396,14 @@ static const struct pv_mmu_ops xen_mmu_ops __initconst = {
 	.make_pmd = PV_CALLEE_SAVE(xen_make_pmd),
 	.pmd_val = PV_CALLEE_SAVE(xen_pmd_val),
 
-#if CONFIG_PGTABLE_LEVELS >= 4
+#ifdef CONFIG_X86_64
 	.pud_val = PV_CALLEE_SAVE(xen_pud_val),
 	.make_pud = PV_CALLEE_SAVE(xen_make_pud),
 	.set_p4d = xen_set_p4d_hyper,
 
 	.alloc_pud = xen_alloc_pmd_init,
 	.release_pud = xen_release_pmd_init,
-#endif	/* CONFIG_PGTABLE_LEVELS == 4 */
+#endif	/* CONFIG_X86_64 */
 
 	.activate_mm = xen_activate_mm,
 	.dup_mmap = xen_dup_mmap,
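
[Editorial sketch] The guards throughout this file move from CONFIG_PGTABLE_LEVELS checks to plain CONFIG_X86_64 because Xen PV guests stay on 4-level paging even when the kernel is built with 5-level support, so the p4d level always has a single entry and the PTRS_PER_P4D loops collapse. A toy model of the simplified xen_p4d_walk() logic (types and helpers are stand-ins, not the kernel's):

#include <stdbool.h>
#include <stdio.h>

struct toy_p4d { bool present; };

static int walk_pud_level(void)
{
	printf("descending into the single PUD table\n");
	return 0;
}

/*
 * After the patch: one p4d entry per pgd entry, so the old
 * "for (i = 0; i < PTRS_PER_P4D; i++)" loop becomes a single test.
 */
static int sketch_p4d_walk(const struct toy_p4d *p4d)
{
	if (!p4d->present)
		return 0;		/* nothing mapped at this slot */
	return walk_pud_level();
}

int main(void)
{
	struct toy_p4d entry = { .present = true };

	return sketch_p4d_walk(&entry);
}

The same reasoning removes the n_p4d bookkeeping and the outer do/while loop from xen_relocate_p2m() above.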
diff --git a/arch/x86/xen/smp_pv.c b/arch/x86/xen/smp_pv.c
index 05f91ce9b55e..c0c756c76afe 100644
--- a/arch/x86/xen/smp_pv.c
+++ b/arch/x86/xen/smp_pv.c
@@ -14,6 +14,7 @@
  * single-threaded.
  */
 #include <linux/sched.h>
+#include <linux/sched/task_stack.h>
 #include <linux/err.h>
 #include <linux/slab.h>
 #include <linux/smp.h>
@@ -294,12 +295,19 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 #endif
 	memset(&ctxt->fpu_ctxt, 0, sizeof(ctxt->fpu_ctxt));
 
+	/*
+	 * Bring up the CPU in cpu_bringup_and_idle() with the stack
+	 * pointing just below where pt_regs would be if it were a normal
+	 * kernel entry.
+	 */
 	ctxt->user_regs.eip = (unsigned long)cpu_bringup_and_idle;
 	ctxt->flags = VGCF_IN_KERNEL;
 	ctxt->user_regs.eflags = 0x1000; /* IOPL_RING1 */
 	ctxt->user_regs.ds = __USER_DS;
 	ctxt->user_regs.es = __USER_DS;
 	ctxt->user_regs.ss = __KERNEL_DS;
+	ctxt->user_regs.cs = __KERNEL_CS;
+	ctxt->user_regs.esp = (unsigned long)task_pt_regs(idle);
 
 	xen_copy_trap_info(ctxt->trap_ctxt);
 
@@ -314,8 +322,13 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 	ctxt->gdt_frames[0] = gdt_mfn;
 	ctxt->gdt_ents      = GDT_ENTRIES;
 
+	/*
+	 * Set SS:SP that Xen will use when entering guest kernel mode
+	 * from guest user mode.  Subsequent calls to load_sp0() can
+	 * change this value.
+	 */
 	ctxt->kernel_ss = __KERNEL_DS;
-	ctxt->kernel_sp = idle->thread.sp0;
+	ctxt->kernel_sp = task_top_of_stack(idle);
 
 #ifdef CONFIG_X86_32
 	ctxt->event_callback_cs     = __KERNEL_CS;
@@ -327,10 +340,8 @@ cpu_initialize_context(unsigned int cpu, struct task_struct *idle)
 		(unsigned long)xen_hypervisor_callback;
 	ctxt->failsafe_callback_eip =
 		(unsigned long)xen_failsafe_callback;
-	ctxt->user_regs.cs = __KERNEL_CS;
 	per_cpu(xen_cr3, cpu) = __pa(swapper_pg_dir);
 
-	ctxt->user_regs.esp = idle->thread.sp0 - sizeof(struct pt_regs);
 	ctxt->ctrlreg[3] = xen_pfn_to_cr3(virt_to_gfn(swapper_pg_dir));
 	if (HYPERVISOR_vcpu_op(VCPUOP_initialise, xen_vcpu_nr(cpu), ctxt))
 		BUG();
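
[Editorial sketch] The bring-up context now takes its stack geometry from the generic task-stack helpers: Xen's kernel SS:SP points at the top of the idle task's stack (task_top_of_stack()), and the initial user_regs.esp sits where a pt_regs frame would live on a normal kernel entry (task_pt_regs()), rather than being derived from thread.sp0. A rough stand-alone sketch of that layout (the constants are illustrative, not the kernel's):

#include <stdio.h>

#define THREAD_SIZE	(4ULL * 4096)	/* illustrative kernel stack size */
#define PTREGS_SIZE	168ULL		/* rough sizeof(struct pt_regs), x86-64 */

/* Toy versions of the helpers the patch switches to. */
static unsigned long long task_top_of_stack(unsigned long long stack_base)
{
	return stack_base + THREAD_SIZE;
}

static unsigned long long task_pt_regs(unsigned long long stack_base)
{
	return task_top_of_stack(stack_base) - PTREGS_SIZE;
}

int main(void)
{
	unsigned long long stack_base = 0xffffc90000100000ULL;	/* made up */

	printf("ctxt->kernel_sp     = %#llx\n", task_top_of_stack(stack_base));
	printf("ctxt->user_regs.esp = %#llx\n", task_pt_regs(stack_base));
	return 0;
}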
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index c98a48c861fd..8a10c9a9e2b5 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -30,7 +30,7 @@ xen_pv_trap debug
 xen_pv_trap xendebug
 xen_pv_trap int3
 xen_pv_trap xenint3
-xen_pv_trap nmi
+xen_pv_trap xennmi
 xen_pv_trap overflow
 xen_pv_trap bounds
 xen_pv_trap invalid_op
diff --git a/arch/x86/xen/xen-head.S b/arch/x86/xen/xen-head.S
index b5b8d7f43557..497cc55a0c16 100644
--- a/arch/x86/xen/xen-head.S
+++ b/arch/x86/xen/xen-head.S
@@ -10,6 +10,7 @@
 #include <asm/boot.h>
 #include <asm/asm.h>
 #include <asm/page_types.h>
+#include <asm/unwind_hints.h>
 
 #include <xen/interface/elfnote.h>
 #include <xen/interface/features.h>
@@ -20,6 +21,7 @@
 #ifdef CONFIG_XEN_PV
 	__INIT
 ENTRY(startup_xen)
+	UNWIND_HINT_EMPTY
 	cld
 
 	/* Clear .bss */
@@ -34,21 +36,24 @@ ENTRY(startup_xen)
 	mov $init_thread_union+THREAD_SIZE, %_ASM_SP
 
 	jmp xen_start_kernel
-
+END(startup_xen)
 	__FINIT
 #endif
 
 .pushsection .text
 	.balign PAGE_SIZE
 ENTRY(hypercall_page)
-	.skip PAGE_SIZE
+	.rept (PAGE_SIZE / 32)
+		UNWIND_HINT_EMPTY
+		.skip 32
+	.endr
 
 #define HYPERCALL(n) \
 	.equ xen_hypercall_##n, hypercall_page + __HYPERVISOR_##n * 32; \
 	.type xen_hypercall_##n, @function; .size xen_hypercall_##n, 32
 #include <asm/xen-hypercalls.h>
 #undef HYPERCALL
-
+END(hypercall_page)
 .popsection
 
 	ELFNOTE(Xen, XEN_ELFNOTE_GUEST_OS,       .asciz "linux")
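
[Editorial sketch] The hypercall page is still a single page split into 32-byte stubs; the .rept/.endr form just lets an UNWIND_HINT_EMPTY annotation be emitted at the start of every stub for the ORC unwinder, and the END() markers close out the symbols for objtool. The HYPERCALL() macro's addressing is plain slot arithmetic; a quick illustration (the page address is made up):

#include <stdio.h>

#define PAGE_SIZE	4096
#define SLOT_SIZE	32	/* each hypercall stub is 32 bytes */

/* xen_hypercall_<n> == hypercall_page + __HYPERVISOR_<n> * 32 */
static unsigned long long hypercall_entry(unsigned long long page,
					  unsigned int nr)
{
	return page + (unsigned long long)nr * SLOT_SIZE;
}

int main(void)
{
	unsigned long long page = 0xffffffff82000000ULL;	/* made up */

	printf("%d slots per page\n", PAGE_SIZE / SLOT_SIZE);
	printf("hypercall #2 stub at %#llx\n", hypercall_entry(page, 2));
	return 0;
}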