summary refs log tree commit diff
path: root/arch
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2015-11-04 11:31:31 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2015-11-04 11:31:31 -0800
commite627078a0cbdc0c391efeb5a2c4eb287328fd633 (patch)
tree9cdabfc9c661ea2ac8801f4611e9541a6411706a /arch
parent14c79092909a52b6fd6394b6ad5e7756c4f9565e (diff)
parentb38feccd663b55ab07116208b68e1ffc7c3c7e78 (diff)
downloadlinux-e627078a0cbdc0c391efeb5a2c4eb287328fd633.tar.gz
Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux
Pull s390 updates from Martin Schwidefsky:
 "There is only one new feature in this pull for the 4.4 merge window,
  most of it is small enhancements, cleanup and bug fixes:

   - Add the s390 backend for the software dirty bit tracking.  This
     adds two new pgtable functions pte_clear_soft_dirty and
     pmd_clear_soft_dirty which is why there is a hit to
     arch/x86/include/asm/pgtable.h in this pull request.

   - A series of cleanup patches for the AP bus, this includes the
     removal of the support for two outdated crypto cards (PCICC and
     PCICA).

   - The irq handling / signaling on buffer full in the runtime
     instrumentation code is dropped.

   - Some micro optimizations: remove unnecessary memory barriers for a
     couple of functions: [smb_]rmb, [smb_]wmb, atomics, bitops, and for
     spin_unlock.  Use the builtin bswap if available and make
     test_and_set_bit_lock more cache friendly.

   - Statistics and a tracepoint for the diagnose calls to the
     hypervisor.

   - The CPU measurement facility support to sample KVM guests is
     improved.

   - The vector instructions are now always enabled for user space
     processes if the hardware has the vector facility.  This simplifies
     the FPU handling code.  The fpu-internal.h header is split into fpu
     internals, api and types just like x86.

   - Cleanup and improvements for the common I/O layer.

   - Rework udelay to solve a problem with kprobe.  udelay has busy loop
     semantics but still uses an idle processor state for the wait"

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/s390/linux: (66 commits)
  s390: remove runtime instrumentation interrupts
  s390/cio: de-duplicate subchannel validation
  s390/css: unneeded initialization in for_each_subchannel
  s390/Kconfig: use builtin bswap
  s390/dasd: fix disconnected device with valid path mask
  s390/dasd: fix invalid PAV assignment after suspend/resume
  s390/dasd: fix double free in dasd_eckd_read_conf
  s390/kernel: fix ptrace peek/poke for floating point registers
  s390/cio: move ccw_device_stlck functions
  s390/cio: move ccw_device_call_handler
  s390/topology: reduce per_cpu() invocations
  s390/nmi: reduce size of percpu variable
  s390/nmi: fix terminology
  s390/nmi: remove casts
  s390/nmi: remove pointless error strings
  s390: don't store registers on disabled wait anymore
  s390: get rid of __set_psw_mask()
  s390/fpu: split fpu-internal.h into fpu internals, api, and type headers
  s390/dasd: fix list_del corruption after lcu changes
  s390/spinlock: remove unneeded serializations at unlock
  ...
Diffstat (limited to 'arch')
-rw-r--r--arch/s390/Kconfig2
-rw-r--r--arch/s390/hypfs/hypfs_diag.c10
-rw-r--r--arch/s390/hypfs/hypfs_diag0c.c2
-rw-r--r--arch/s390/hypfs/hypfs_sprp.c9
-rw-r--r--arch/s390/hypfs/hypfs_vm.c2
-rw-r--r--arch/s390/include/asm/appldata.h2
-rw-r--r--arch/s390/include/asm/atomic.h2
-rw-r--r--arch/s390/include/asm/barrier.h8
-rw-r--r--arch/s390/include/asm/bitops.h55
-rw-r--r--arch/s390/include/asm/cio.h10
-rw-r--r--arch/s390/include/asm/cmb.h1
-rw-r--r--arch/s390/include/asm/cmpxchg.h30
-rw-r--r--arch/s390/include/asm/cpu_mf.h5
-rw-r--r--arch/s390/include/asm/ctl_reg.h2
-rw-r--r--arch/s390/include/asm/diag.h29
-rw-r--r--arch/s390/include/asm/etr.h10
-rw-r--r--arch/s390/include/asm/fpu/api.h30
-rw-r--r--arch/s390/include/asm/fpu/internal.h (renamed from arch/s390/include/asm/fpu-internal.h)51
-rw-r--r--arch/s390/include/asm/fpu/types.h25
-rw-r--r--arch/s390/include/asm/idle.h2
-rw-r--r--arch/s390/include/asm/irq.h14
-rw-r--r--arch/s390/include/asm/kvm_host.h2
-rw-r--r--arch/s390/include/asm/kvm_para.h67
-rw-r--r--arch/s390/include/asm/lowcore.h11
-rw-r--r--arch/s390/include/asm/nmi.h98
-rw-r--r--arch/s390/include/asm/pgtable.h66
-rw-r--r--arch/s390/include/asm/processor.h99
-rw-r--r--arch/s390/include/asm/ptrace.h11
-rw-r--r--arch/s390/include/asm/setup.h50
-rw-r--r--arch/s390/include/asm/spinlock.h3
-rw-r--r--arch/s390/include/asm/switch_to.h2
-rw-r--r--arch/s390/include/asm/thread_info.h22
-rw-r--r--arch/s390/include/asm/trace/diag.h43
-rw-r--r--arch/s390/kernel/Makefile2
-rw-r--r--arch/s390/kernel/asm-offsets.c290
-rw-r--r--arch/s390/kernel/compat_signal.c7
-rw-r--r--arch/s390/kernel/cpcmd.c2
-rw-r--r--arch/s390/kernel/crash_dump.c16
-rw-r--r--arch/s390/kernel/diag.c134
-rw-r--r--arch/s390/kernel/early.c15
-rw-r--r--arch/s390/kernel/entry.S230
-rw-r--r--arch/s390/kernel/entry.h3
-rw-r--r--arch/s390/kernel/head64.S7
-rw-r--r--arch/s390/kernel/ipl.c9
-rw-r--r--arch/s390/kernel/irq.c1
-rw-r--r--arch/s390/kernel/nmi.c120
-rw-r--r--arch/s390/kernel/perf_cpum_sf.c10
-rw-r--r--arch/s390/kernel/process.c37
-rw-r--r--arch/s390/kernel/processor.c5
-rw-r--r--arch/s390/kernel/ptrace.c52
-rw-r--r--arch/s390/kernel/runtime_instr.c64
-rw-r--r--arch/s390/kernel/s390_ksyms.c3
-rw-r--r--arch/s390/kernel/signal.c7
-rw-r--r--arch/s390/kernel/smp.c10
-rw-r--r--arch/s390/kernel/time.c31
-rw-r--r--arch/s390/kernel/topology.c28
-rw-r--r--arch/s390/kernel/trace.c29
-rw-r--r--arch/s390/kernel/traps.c41
-rw-r--r--arch/s390/kernel/vdso.c2
-rw-r--r--arch/s390/kvm/kvm-s390.c6
-rw-r--r--arch/s390/lib/delay.c30
-rw-r--r--arch/s390/lib/find.c4
-rw-r--r--arch/s390/lib/spinlock.c4
-rw-r--r--arch/s390/mm/extmem.c3
-rw-r--r--arch/s390/mm/fault.c7
-rw-r--r--arch/s390/mm/hugetlbpage.c2
-rw-r--r--arch/s390/numa/mode_emu.c10
-rw-r--r--arch/s390/pci/pci_insn.c6
-rw-r--r--arch/x86/include/asm/pgtable.h10
69 files changed, 1146 insertions, 866 deletions
diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig
index 1d57000b1b24..9b9a2db06810 100644
--- a/arch/s390/Kconfig
+++ b/arch/s390/Kconfig
@@ -101,6 +101,7 @@ config S390
 	select ARCH_SAVE_PAGE_KEYS if HIBERNATION
 	select ARCH_SUPPORTS_ATOMIC_RMW
 	select ARCH_SUPPORTS_NUMA_BALANCING
+	select ARCH_USE_BUILTIN_BSWAP
 	select ARCH_USE_CMPXCHG_LOCKREF
 	select ARCH_WANTS_PROT_NUMA_PROT_NONE
 	select ARCH_WANT_IPC_PARSE_VERSION
@@ -118,6 +119,7 @@ config S390
 	select HAVE_ARCH_EARLY_PFN_TO_NID
 	select HAVE_ARCH_JUMP_LABEL
 	select HAVE_ARCH_SECCOMP_FILTER
+	select HAVE_ARCH_SOFT_DIRTY
 	select HAVE_ARCH_TRACEHOOK
 	select HAVE_ARCH_TRANSPARENT_HUGEPAGE
 	select HAVE_BPF_JIT if PACK_STACK && HAVE_MARCH_Z196_FEATURES
diff --git a/arch/s390/hypfs/hypfs_diag.c b/arch/s390/hypfs/hypfs_diag.c
index 5eeffeefae06..045035796ca7 100644
--- a/arch/s390/hypfs/hypfs_diag.c
+++ b/arch/s390/hypfs/hypfs_diag.c
@@ -15,6 +15,7 @@
 #include <linux/string.h>
 #include <linux/vmalloc.h>
 #include <linux/mm.h>
+#include <asm/diag.h>
 #include <asm/ebcdic.h>
 #include "hypfs.h"
 
@@ -336,7 +337,7 @@ static inline __u64 phys_cpu__ctidx(enum diag204_format type, void *hdr)
 
 /* Diagnose 204 functions */
 
-static int diag204(unsigned long subcode, unsigned long size, void *addr)
+static inline int __diag204(unsigned long subcode, unsigned long size, void *addr)
 {
 	register unsigned long _subcode asm("0") = subcode;
 	register unsigned long _size asm("1") = size;
@@ -351,6 +352,12 @@ static int diag204(unsigned long subcode, unsigned long size, void *addr)
 	return _size;
 }
 
+static int diag204(unsigned long subcode, unsigned long size, void *addr)
+{
+	diag_stat_inc(DIAG_STAT_X204);
+	return __diag204(subcode, size, addr);
+}
+
 /*
  * For the old diag subcode 4 with simple data format we have to use real
  * memory. If we use subcode 6 or 7 with extended data format, we can (and
@@ -505,6 +512,7 @@ static int diag224(void *ptr)
 {
 	int rc = -EOPNOTSUPP;
 
+	diag_stat_inc(DIAG_STAT_X224);
 	asm volatile(
 		"	diag	%1,%2,0x224\n"
 		"0:	lhi	%0,0x0\n"
diff --git a/arch/s390/hypfs/hypfs_diag0c.c b/arch/s390/hypfs/hypfs_diag0c.c
index 24c747a0fcc3..0f1927cbba31 100644
--- a/arch/s390/hypfs/hypfs_diag0c.c
+++ b/arch/s390/hypfs/hypfs_diag0c.c
@@ -8,6 +8,7 @@
 
 #include <linux/slab.h>
 #include <linux/cpu.h>
+#include <asm/diag.h>
 #include <asm/hypfs.h>
 #include "hypfs.h"
 
@@ -18,6 +19,7 @@
  */
 static void diag0c(struct hypfs_diag0c_entry *entry)
 {
+	diag_stat_inc(DIAG_STAT_X00C);
 	asm volatile (
 		"	sam31\n"
 		"	diag	%0,%0,0x0c\n"
diff --git a/arch/s390/hypfs/hypfs_sprp.c b/arch/s390/hypfs/hypfs_sprp.c
index dd42a26d049d..c9e5c72f78bd 100644
--- a/arch/s390/hypfs/hypfs_sprp.c
+++ b/arch/s390/hypfs/hypfs_sprp.c
@@ -13,6 +13,7 @@
 #include <linux/types.h>
 #include <linux/uaccess.h>
 #include <asm/compat.h>
+#include <asm/diag.h>
 #include <asm/sclp.h>
 #include "hypfs.h"
 
@@ -22,7 +23,7 @@
 
 #define DIAG304_CMD_MAX		2
 
-static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd)
+static inline unsigned long __hypfs_sprp_diag304(void *data, unsigned long cmd)
 {
 	register unsigned long _data asm("2") = (unsigned long) data;
 	register unsigned long _rc asm("3");
@@ -34,6 +35,12 @@ static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd)
 	return _rc;
 }
 
+static unsigned long hypfs_sprp_diag304(void *data, unsigned long cmd)
+{
+	diag_stat_inc(DIAG_STAT_X304);
+	return __hypfs_sprp_diag304(data, cmd);
+}
+
 static void hypfs_sprp_free(const void *data)
 {
 	free_page((unsigned long) data);
diff --git a/arch/s390/hypfs/hypfs_vm.c b/arch/s390/hypfs/hypfs_vm.c
index afbe07907c10..44feac38ccfc 100644
--- a/arch/s390/hypfs/hypfs_vm.c
+++ b/arch/s390/hypfs/hypfs_vm.c
@@ -9,6 +9,7 @@
 #include <linux/errno.h>
 #include <linux/string.h>
 #include <linux/vmalloc.h>
+#include <asm/diag.h>
 #include <asm/ebcdic.h>
 #include <asm/timex.h>
 #include "hypfs.h"
@@ -66,6 +67,7 @@ static int diag2fc(int size, char* query, void *addr)
 	memset(parm_list.aci_grp, 0x40, NAME_LEN);
 	rc = -1;
 
+	diag_stat_inc(DIAG_STAT_X2FC);
 	asm volatile(
 		"	diag    %0,%1,0x2fc\n"
 		"0:\n"
diff --git a/arch/s390/include/asm/appldata.h b/arch/s390/include/asm/appldata.h
index 16887c5fd989..a6263d4e8e56 100644
--- a/arch/s390/include/asm/appldata.h
+++ b/arch/s390/include/asm/appldata.h
@@ -7,6 +7,7 @@
 #ifndef _ASM_S390_APPLDATA_H
 #define _ASM_S390_APPLDATA_H
 
+#include <asm/diag.h>
 #include <asm/io.h>
 
 #define APPLDATA_START_INTERVAL_REC	0x80
@@ -53,6 +54,7 @@ static inline int appldata_asm(struct appldata_product_id *id,
 	parm_list.buffer_length = length;
 	parm_list.product_id_addr = (unsigned long) id;
 	parm_list.buffer_addr = virt_to_phys(buffer);
+	diag_stat_inc(DIAG_STAT_X0DC);
 	asm volatile(
 		"	diag	%1,%0,0xdc"
 		: "=d" (ry)
diff --git a/arch/s390/include/asm/atomic.h b/arch/s390/include/asm/atomic.h
index 117fa5c921c1..911064aa59b2 100644
--- a/arch/s390/include/asm/atomic.h
+++ b/arch/s390/include/asm/atomic.h
@@ -36,7 +36,6 @@
 									\
 	typecheck(atomic_t *, ptr);					\
 	asm volatile(							\
-		__barrier						\
 		op_string "	%0,%2,%1\n"				\
 		__barrier						\
 		: "=d" (old_val), "+Q" ((ptr)->counter)			\
@@ -180,7 +179,6 @@ static inline int __atomic_add_unless(atomic_t *v, int a, int u)
 									\
 	typecheck(atomic64_t *, ptr);					\
 	asm volatile(							\
-		__barrier						\
 		op_string "	%0,%2,%1\n"				\
 		__barrier						\
 		: "=d" (old_val), "+Q" ((ptr)->counter)			\
diff --git a/arch/s390/include/asm/barrier.h b/arch/s390/include/asm/barrier.h
index d48fe0162331..d68e11e0df5e 100644
--- a/arch/s390/include/asm/barrier.h
+++ b/arch/s390/include/asm/barrier.h
@@ -22,10 +22,10 @@
 
 #define mb() do {  asm volatile(__ASM_BARRIER : : : "memory"); } while (0)
 
-#define rmb()				mb()
-#define wmb()				mb()
-#define dma_rmb()			rmb()
-#define dma_wmb()			wmb()
+#define rmb()				barrier()
+#define wmb()				barrier()
+#define dma_rmb()			mb()
+#define dma_wmb()			mb()
 #define smp_mb()			mb()
 #define smp_rmb()			rmb()
 #define smp_wmb()			wmb()
diff --git a/arch/s390/include/asm/bitops.h b/arch/s390/include/asm/bitops.h
index 9b68e98a724f..8043f10da6b5 100644
--- a/arch/s390/include/asm/bitops.h
+++ b/arch/s390/include/asm/bitops.h
@@ -11,30 +11,25 @@
  * big-endian system because, unlike little endian, the number of each
  * bit depends on the word size.
  *
- * The bitop functions are defined to work on unsigned longs, so for an
- * s390x system the bits end up numbered:
+ * The bitop functions are defined to work on unsigned longs, so the bits
+ * end up numbered:
  *   |63..............0|127............64|191...........128|255...........192|
- * and on s390:
- *   |31.....0|63....32|95....64|127...96|159..128|191..160|223..192|255..224|
  *
  * There are a few little-endian macros used mostly for filesystem
- * bitmaps, these work on similar bit arrays layouts, but
- * byte-oriented:
+ * bitmaps, these work on similar bit array layouts, but byte-oriented:
  *   |7...0|15...8|23...16|31...24|39...32|47...40|55...48|63...56|
  *
- * The main difference is that bit 3-5 (64b) or 3-4 (32b) in the bit
- * number field needs to be reversed compared to the big-endian bit
- * fields. This can be achieved by XOR with 0x38 (64b) or 0x18 (32b).
+ * The main difference is that bit 3-5 in the bit number field needs to be
+ * reversed compared to the big-endian bit fields. This can be achieved by
+ * XOR with 0x38.
  *
- * We also have special functions which work with an MSB0 encoding:
- * on an s390x system the bits are numbered:
+ * We also have special functions which work with an MSB0 encoding.
+ * The bits are numbered:
  *   |0..............63|64............127|128...........191|192...........255|
- * and on s390:
- *   |0.....31|32....63|64....95|96...127|128..159|160..191|192..223|224..255|
  *
- * The main difference is that bit 0-63 (64b) or 0-31 (32b) in the bit
- * number field needs to be reversed compared to the LSB0 encoded bit
- * fields. This can be achieved by XOR with 0x3f (64b) or 0x1f (32b).
+ * The main difference is that bit 0-63 in the bit number field needs to be
+ * reversed compared to the LSB0 encoded bit fields. This can be achieved by
+ * XOR with 0x3f.
  *
  */
 
@@ -64,7 +59,6 @@
 								\
 	typecheck(unsigned long *, (__addr));			\
 	asm volatile(						\
-		__barrier					\
 		__op_string "	%0,%2,%1\n"			\
 		__barrier					\
 		: "=d" (__old),	"+Q" (*(__addr))		\
@@ -276,12 +270,32 @@ static inline int test_bit(unsigned long nr, const volatile unsigned long *ptr)
 	return (*addr >> (nr & 7)) & 1;
 }
 
+static inline int test_and_set_bit_lock(unsigned long nr,
+					volatile unsigned long *ptr)
+{
+	if (test_bit(nr, ptr))
+		return 1;
+	return test_and_set_bit(nr, ptr);
+}
+
+static inline void clear_bit_unlock(unsigned long nr,
+				    volatile unsigned long *ptr)
+{
+	smp_mb__before_atomic();
+	clear_bit(nr, ptr);
+}
+
+static inline void __clear_bit_unlock(unsigned long nr,
+				      volatile unsigned long *ptr)
+{
+	smp_mb();
+	__clear_bit(nr, ptr);
+}
+
 /*
  * Functions which use MSB0 bit numbering.
- * On an s390x system the bits are numbered:
+ * The bits are numbered:
  *   |0..............63|64............127|128...........191|192...........255|
- * and on s390:
- *   |0.....31|32....63|64....95|96...127|128..159|160..191|192..223|224..255|
  */
 unsigned long find_first_bit_inv(const unsigned long *addr, unsigned long size);
 unsigned long find_next_bit_inv(const unsigned long *addr, unsigned long size,
@@ -446,7 +460,6 @@ static inline int fls(int word)
 #include <asm-generic/bitops/ffz.h>
 #include <asm-generic/bitops/find.h>
 #include <asm-generic/bitops/hweight.h>
-#include <asm-generic/bitops/lock.h>
 #include <asm-generic/bitops/sched.h>
 #include <asm-generic/bitops/le.h>
 #include <asm-generic/bitops/ext2-atomic-setbit.h>
diff --git a/arch/s390/include/asm/cio.h b/arch/s390/include/asm/cio.h
index 096339207764..0c5d8ee657f0 100644
--- a/arch/s390/include/asm/cio.h
+++ b/arch/s390/include/asm/cio.h
@@ -5,6 +5,7 @@
 #define _ASM_S390_CIO_H_
 
 #include <linux/spinlock.h>
+#include <linux/bitops.h>
 #include <asm/types.h>
 
 #define LPM_ANYPATH 0xff
@@ -296,6 +297,15 @@ static inline int ccw_dev_id_is_equal(struct ccw_dev_id *dev_id1,
 	return 0;
 }
 
+/**
+ * pathmask_to_pos() - find the position of the left-most bit in a pathmask
+ * @mask: pathmask with at least one bit set
+ */
+static inline u8 pathmask_to_pos(u8 mask)
+{
+	return 8 - ffs(mask);
+}
+
 void channel_subsystem_reinit(void);
 extern void css_schedule_reprobe(void);
 
diff --git a/arch/s390/include/asm/cmb.h b/arch/s390/include/asm/cmb.h
index 806eac12e3bd..ed2630c23f90 100644
--- a/arch/s390/include/asm/cmb.h
+++ b/arch/s390/include/asm/cmb.h
@@ -6,6 +6,7 @@
 struct ccw_device;
 extern int enable_cmf(struct ccw_device *cdev);
 extern int disable_cmf(struct ccw_device *cdev);
+extern int __disable_cmf(struct ccw_device *cdev);
 extern u64 cmf_read(struct ccw_device *cdev, int index);
 extern int cmf_readall(struct ccw_device *cdev, struct cmbdata *data);
 
diff --git a/arch/s390/include/asm/cmpxchg.h b/arch/s390/include/asm/cmpxchg.h
index 411464f4c97a..24ea6948e32b 100644
--- a/arch/s390/include/asm/cmpxchg.h
+++ b/arch/s390/include/asm/cmpxchg.h
@@ -32,7 +32,7 @@
 	__old;								\
 })
 
-#define __cmpxchg_double_op(p1, p2, o1, o2, n1, n2, insn)		\
+#define __cmpxchg_double(p1, p2, o1, o2, n1, n2)			\
 ({									\
 	register __typeof__(*(p1)) __old1 asm("2") = (o1);		\
 	register __typeof__(*(p2)) __old2 asm("3") = (o2);		\
@@ -40,7 +40,7 @@
 	register __typeof__(*(p2)) __new2 asm("5") = (n2);		\
 	int cc;								\
 	asm volatile(							\
-			insn   " %[old],%[new],%[ptr]\n"		\
+		"	cdsg	%[old],%[new],%[ptr]\n"			\
 		"	ipm	%[cc]\n"				\
 		"	srl	%[cc],28"				\
 		: [cc] "=d" (cc), [old] "+d" (__old1), "+d" (__old2)	\
@@ -50,30 +50,6 @@
 	!cc;								\
 })
 
-#define __cmpxchg_double_4(p1, p2, o1, o2, n1, n2) \
-	__cmpxchg_double_op(p1, p2, o1, o2, n1, n2, "cds")
-
-#define __cmpxchg_double_8(p1, p2, o1, o2, n1, n2) \
-	__cmpxchg_double_op(p1, p2, o1, o2, n1, n2, "cdsg")
-
-extern void __cmpxchg_double_called_with_bad_pointer(void);
-
-#define __cmpxchg_double(p1, p2, o1, o2, n1, n2)			\
-({									\
-	int __ret;							\
-	switch (sizeof(*(p1))) {					\
-	case 4:								\
-		__ret = __cmpxchg_double_4(p1, p2, o1, o2, n1, n2);	\
-		break;							\
-	case 8:								\
-		__ret = __cmpxchg_double_8(p1, p2, o1, o2, n1, n2);	\
-		break;							\
-	default:							\
-		__cmpxchg_double_called_with_bad_pointer();		\
-	}								\
-	__ret;								\
-})
-
 #define cmpxchg_double(p1, p2, o1, o2, n1, n2)				\
 ({									\
 	__typeof__(p1) __p1 = (p1);					\
@@ -81,7 +57,7 @@ extern void __cmpxchg_double_called_with_bad_pointer(void);
 	BUILD_BUG_ON(sizeof(*(p1)) != sizeof(long));			\
 	BUILD_BUG_ON(sizeof(*(p2)) != sizeof(long));			\
 	VM_BUG_ON((unsigned long)((__p1) + 1) != (unsigned long)(__p2));\
-	__cmpxchg_double_8(__p1, __p2, o1, o2, n1, n2);			\
+	__cmpxchg_double(__p1, __p2, o1, o2, n1, n2);			\
 })
 
 #define system_has_cmpxchg_double()	1
diff --git a/arch/s390/include/asm/cpu_mf.h b/arch/s390/include/asm/cpu_mf.h
index 5243a8679a1d..9dd04b9e9782 100644
--- a/arch/s390/include/asm/cpu_mf.h
+++ b/arch/s390/include/asm/cpu_mf.h
@@ -22,15 +22,10 @@
 #define CPU_MF_INT_SF_LSDA	(1 << 22)	/* loss of sample data alert */
 #define CPU_MF_INT_CF_CACA	(1 <<  7)	/* counter auth. change alert */
 #define CPU_MF_INT_CF_LCDA	(1 <<  6)	/* loss of counter data alert */
-#define CPU_MF_INT_RI_HALTED	(1 <<  5)	/* run-time instr. halted */
-#define CPU_MF_INT_RI_BUF_FULL	(1 <<  4)	/* run-time instr. program
-						   buffer full */
-
 #define CPU_MF_INT_CF_MASK	(CPU_MF_INT_CF_CACA|CPU_MF_INT_CF_LCDA)
 #define CPU_MF_INT_SF_MASK	(CPU_MF_INT_SF_IAE|CPU_MF_INT_SF_ISE|	\
 				 CPU_MF_INT_SF_PRA|CPU_MF_INT_SF_SACA|	\
 				 CPU_MF_INT_SF_LSDA)
-#define CPU_MF_INT_RI_MASK	(CPU_MF_INT_RI_HALTED|CPU_MF_INT_RI_BUF_FULL)
 
 /* CPU measurement facility support */
 static inline int cpum_cf_avail(void)
diff --git a/arch/s390/include/asm/ctl_reg.h b/arch/s390/include/asm/ctl_reg.h
index 17a373576868..d7697ab802f6 100644
--- a/arch/s390/include/asm/ctl_reg.h
+++ b/arch/s390/include/asm/ctl_reg.h
@@ -46,8 +46,6 @@ static inline void __ctl_clear_bit(unsigned int cr, unsigned int bit)
 	__ctl_load(reg, cr, cr);
 }
 
-void __ctl_set_vx(void);
-
 void smp_ctl_set_bit(int cr, int bit);
 void smp_ctl_clear_bit(int cr, int bit);
 
diff --git a/arch/s390/include/asm/diag.h b/arch/s390/include/asm/diag.h
index 7e91c58072e2..5fac921c1c42 100644
--- a/arch/s390/include/asm/diag.h
+++ b/arch/s390/include/asm/diag.h
@@ -8,6 +8,34 @@
 #ifndef _ASM_S390_DIAG_H
 #define _ASM_S390_DIAG_H
 
+#include <linux/percpu.h>
+
+enum diag_stat_enum {
+	DIAG_STAT_X008,
+	DIAG_STAT_X00C,
+	DIAG_STAT_X010,
+	DIAG_STAT_X014,
+	DIAG_STAT_X044,
+	DIAG_STAT_X064,
+	DIAG_STAT_X09C,
+	DIAG_STAT_X0DC,
+	DIAG_STAT_X204,
+	DIAG_STAT_X210,
+	DIAG_STAT_X224,
+	DIAG_STAT_X250,
+	DIAG_STAT_X258,
+	DIAG_STAT_X288,
+	DIAG_STAT_X2C4,
+	DIAG_STAT_X2FC,
+	DIAG_STAT_X304,
+	DIAG_STAT_X308,
+	DIAG_STAT_X500,
+	NR_DIAG_STAT
+};
+
+void diag_stat_inc(enum diag_stat_enum nr);
+void diag_stat_inc_norecursion(enum diag_stat_enum nr);
+
 /*
  * Diagnose 10: Release page range
  */
@@ -18,6 +46,7 @@ static inline void diag10_range(unsigned long start_pfn, unsigned long num_pfn)
 	start_addr = start_pfn << PAGE_SHIFT;
 	end_addr = (start_pfn + num_pfn - 1) << PAGE_SHIFT;
 
+	diag_stat_inc(DIAG_STAT_X010);
 	asm volatile(
 		"0:	diag	%0,%1,0x10\n"
 		"1:\n"
diff --git a/arch/s390/include/asm/etr.h b/arch/s390/include/asm/etr.h
index f7e5c36688c3..105f90e63a0e 100644
--- a/arch/s390/include/asm/etr.h
+++ b/arch/s390/include/asm/etr.h
@@ -211,8 +211,9 @@ static inline int etr_ptff(void *ptff_block, unsigned int func)
 #define ETR_PTFF_SGS	0x43	/* set gross steering rate */
 
 /* Functions needed by the machine check handler */
-void etr_switch_to_local(void);
-void etr_sync_check(void);
+int etr_switch_to_local(void);
+int etr_sync_check(void);
+void etr_queue_work(void);
 
 /* notifier for syncs */
 extern struct atomic_notifier_head s390_epoch_delta_notifier;
@@ -253,7 +254,8 @@ struct stp_sstpi {
 } __attribute__ ((packed));
 
 /* Functions needed by the machine check handler */
-void stp_sync_check(void);
-void stp_island_check(void);
+int stp_sync_check(void);
+int stp_island_check(void);
+void stp_queue_work(void);
 
 #endif /* __S390_ETR_H */
diff --git a/arch/s390/include/asm/fpu/api.h b/arch/s390/include/asm/fpu/api.h
new file mode 100644
index 000000000000..5e04f3cbd320
--- /dev/null
+++ b/arch/s390/include/asm/fpu/api.h
@@ -0,0 +1,30 @@
+/*
+ * In-kernel FPU support functions
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#ifndef _ASM_S390_FPU_API_H
+#define _ASM_S390_FPU_API_H
+
+void save_fpu_regs(void);
+
+static inline int test_fp_ctl(u32 fpc)
+{
+	u32 orig_fpc;
+	int rc;
+
+	asm volatile(
+		"	efpc    %1\n"
+		"	sfpc	%2\n"
+		"0:	sfpc	%1\n"
+		"	la	%0,0\n"
+		"1:\n"
+		EX_TABLE(0b,1b)
+		: "=d" (rc), "=d" (orig_fpc)
+		: "d" (fpc), "0" (-EINVAL));
+	return rc;
+}
+
+#endif /* _ASM_S390_FPU_API_H */
diff --git a/arch/s390/include/asm/fpu-internal.h b/arch/s390/include/asm/fpu/internal.h
index 55dc2c0fb40a..2559b16da525 100644
--- a/arch/s390/include/asm/fpu-internal.h
+++ b/arch/s390/include/asm/fpu/internal.h
@@ -1,5 +1,5 @@
 /*
- * General floating pointer and vector register helpers
+ * FPU state and register content conversion primitives
  *
  * Copyright IBM Corp. 2015
  * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
@@ -8,50 +8,9 @@
 #ifndef _ASM_S390_FPU_INTERNAL_H
 #define _ASM_S390_FPU_INTERNAL_H
 
-#define FPU_USE_VX		1	/* Vector extension is active */
-
-#ifndef __ASSEMBLY__
-
-#include <linux/errno.h>
 #include <linux/string.h>
-#include <asm/linkage.h>
 #include <asm/ctl_reg.h>
-#include <asm/sigcontext.h>
-
-struct fpu {
-	__u32 fpc;			/* Floating-point control */
-	__u32 flags;
-	union {
-		void *regs;
-		freg_t *fprs;		/* Floating-point register save area */
-		__vector128 *vxrs;	/* Vector register save area */
-	};
-};
-
-void save_fpu_regs(void);
-
-#define is_vx_fpu(fpu) (!!((fpu)->flags & FPU_USE_VX))
-#define is_vx_task(tsk) (!!((tsk)->thread.fpu.flags & FPU_USE_VX))
-
-/* VX array structure for address operand constraints in inline assemblies */
-struct vx_array { __vector128 _[__NUM_VXRS]; };
-
-static inline int test_fp_ctl(u32 fpc)
-{
-	u32 orig_fpc;
-	int rc;
-
-	asm volatile(
-		"	efpc    %1\n"
-		"	sfpc	%2\n"
-		"0:	sfpc	%1\n"
-		"	la	%0,0\n"
-		"1:\n"
-		EX_TABLE(0b,1b)
-		: "=d" (rc), "=d" (orig_fpc)
-		: "d" (fpc), "0" (-EINVAL));
-	return rc;
-}
+#include <asm/fpu/types.h>
 
 static inline void save_vx_regs_safe(__vector128 *vxrs)
 {
@@ -89,7 +48,7 @@ static inline void convert_fp_to_vx(__vector128 *vxrs, freg_t *fprs)
 static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
 {
 	fpregs->pad = 0;
-	if (is_vx_fpu(fpu))
+	if (MACHINE_HAS_VX)
 		convert_vx_to_fp((freg_t *)&fpregs->fprs, fpu->vxrs);
 	else
 		memcpy((freg_t *)&fpregs->fprs, fpu->fprs,
@@ -98,13 +57,11 @@ static inline void fpregs_store(_s390_fp_regs *fpregs, struct fpu *fpu)
 
 static inline void fpregs_load(_s390_fp_regs *fpregs, struct fpu *fpu)
 {
-	if (is_vx_fpu(fpu))
+	if (MACHINE_HAS_VX)
 		convert_fp_to_vx(fpu->vxrs, (freg_t *)&fpregs->fprs);
 	else
 		memcpy(fpu->fprs, (freg_t *)&fpregs->fprs,
 		       sizeof(fpregs->fprs));
 }
 
-#endif
-
 #endif /* _ASM_S390_FPU_INTERNAL_H */
diff --git a/arch/s390/include/asm/fpu/types.h b/arch/s390/include/asm/fpu/types.h
new file mode 100644
index 000000000000..14a8b0c14f87
--- /dev/null
+++ b/arch/s390/include/asm/fpu/types.h
@@ -0,0 +1,25 @@
+/*
+ * FPU data structures
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Hendrik Brueckner <brueckner@linux.vnet.ibm.com>
+ */
+
+#ifndef _ASM_S390_FPU_TYPES_H
+#define _ASM_S390_FPU_TYPES_H
+
+#include <asm/sigcontext.h>
+
+struct fpu {
+	__u32 fpc;			/* Floating-point control */
+	union {
+		void *regs;
+		freg_t *fprs;		/* Floating-point register save area */
+		__vector128 *vxrs;	/* Vector register save area */
+	};
+};
+
+/* VX array structure for address operand constraints in inline assemblies */
+struct vx_array { __vector128 _[__NUM_VXRS]; };
+
+#endif /* _ASM_S390_FPU_TYPES_H */
diff --git a/arch/s390/include/asm/idle.h b/arch/s390/include/asm/idle.h
index 113cd963dbbe..51ff96d9f287 100644
--- a/arch/s390/include/asm/idle.h
+++ b/arch/s390/include/asm/idle.h
@@ -24,4 +24,6 @@ struct s390_idle_data {
 extern struct device_attribute dev_attr_idle_count;
 extern struct device_attribute dev_attr_idle_time_us;
 
+void psw_idle(struct s390_idle_data *, unsigned long);
+
 #endif /* _S390_IDLE_H */
diff --git a/arch/s390/include/asm/irq.h b/arch/s390/include/asm/irq.h
index ff95d15a2384..f97b055de76a 100644
--- a/arch/s390/include/asm/irq.h
+++ b/arch/s390/include/asm/irq.h
@@ -47,7 +47,6 @@ enum interruption_class {
 	IRQEXT_IUC,
 	IRQEXT_CMS,
 	IRQEXT_CMC,
-	IRQEXT_CMR,
 	IRQEXT_FTP,
 	IRQIO_CIO,
 	IRQIO_QAI,
@@ -96,6 +95,19 @@ enum irq_subclass {
 	IRQ_SUBCLASS_SERVICE_SIGNAL = 9,
 };
 
+#define CR0_IRQ_SUBCLASS_MASK					  \
+	((1UL << (63 - 30))  /* Warning Track */		| \
+	 (1UL << (63 - 48))  /* Malfunction Alert */		| \
+	 (1UL << (63 - 49))  /* Emergency Signal */		| \
+	 (1UL << (63 - 50))  /* External Call */		| \
+	 (1UL << (63 - 52))  /* Clock Comparator */		| \
+	 (1UL << (63 - 53))  /* CPU Timer */			| \
+	 (1UL << (63 - 54))  /* Service Signal */		| \
+	 (1UL << (63 - 57))  /* Interrupt Key */		| \
+	 (1UL << (63 - 58))  /* Measurement Alert */		| \
+	 (1UL << (63 - 59))  /* Timing Alert */			| \
+	 (1UL << (63 - 62))) /* IUCV */
+
 void irq_subclass_register(enum irq_subclass subclass);
 void irq_subclass_unregister(enum irq_subclass subclass);
 
diff --git a/arch/s390/include/asm/kvm_host.h b/arch/s390/include/asm/kvm_host.h
index 8ced426091e1..7f654308817c 100644
--- a/arch/s390/include/asm/kvm_host.h
+++ b/arch/s390/include/asm/kvm_host.h
@@ -22,7 +22,7 @@
 #include <linux/kvm.h>
 #include <asm/debug.h>
 #include <asm/cpu.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
 #include <asm/isc.h>
 
 #define KVM_MAX_VCPUS 64
diff --git a/arch/s390/include/asm/kvm_para.h b/arch/s390/include/asm/kvm_para.h
index e0f842308a68..41393052ac57 100644
--- a/arch/s390/include/asm/kvm_para.h
+++ b/arch/s390/include/asm/kvm_para.h
@@ -27,10 +27,9 @@
 #define __S390_KVM_PARA_H
 
 #include <uapi/asm/kvm_para.h>
+#include <asm/diag.h>
 
-
-
-static inline long kvm_hypercall0(unsigned long nr)
+static inline long __kvm_hypercall0(unsigned long nr)
 {
 	register unsigned long __nr asm("1") = nr;
 	register long __rc asm("2");
@@ -40,7 +39,13 @@ static inline long kvm_hypercall0(unsigned long nr)
 	return __rc;
 }
 
-static inline long kvm_hypercall1(unsigned long nr, unsigned long p1)
+static inline long kvm_hypercall0(unsigned long nr)
+{
+	diag_stat_inc(DIAG_STAT_X500);
+	return __kvm_hypercall0(nr);
+}
+
+static inline long __kvm_hypercall1(unsigned long nr, unsigned long p1)
 {
 	register unsigned long __nr asm("1") = nr;
 	register unsigned long __p1 asm("2") = p1;
@@ -51,7 +56,13 @@ static inline long kvm_hypercall1(unsigned long nr, unsigned long p1)
 	return __rc;
 }
 
-static inline long kvm_hypercall2(unsigned long nr, unsigned long p1,
+static inline long kvm_hypercall1(unsigned long nr, unsigned long p1)
+{
+	diag_stat_inc(DIAG_STAT_X500);
+	return __kvm_hypercall1(nr, p1);
+}
+
+static inline long __kvm_hypercall2(unsigned long nr, unsigned long p1,
 			       unsigned long p2)
 {
 	register unsigned long __nr asm("1") = nr;
@@ -65,7 +76,14 @@ static inline long kvm_hypercall2(unsigned long nr, unsigned long p1,
 	return __rc;
 }
 
-static inline long kvm_hypercall3(unsigned long nr, unsigned long p1,
+static inline long kvm_hypercall2(unsigned long nr, unsigned long p1,
+			       unsigned long p2)
+{
+	diag_stat_inc(DIAG_STAT_X500);
+	return __kvm_hypercall2(nr, p1, p2);
+}
+
+static inline long __kvm_hypercall3(unsigned long nr, unsigned long p1,
 			       unsigned long p2, unsigned long p3)
 {
 	register unsigned long __nr asm("1") = nr;
@@ -80,8 +98,14 @@ static inline long kvm_hypercall3(unsigned long nr, unsigned long p1,
 	return __rc;
 }
 
+static inline long kvm_hypercall3(unsigned long nr, unsigned long p1,
+			       unsigned long p2, unsigned long p3)
+{
+	diag_stat_inc(DIAG_STAT_X500);
+	return __kvm_hypercall3(nr, p1, p2, p3);
+}
 
-static inline long kvm_hypercall4(unsigned long nr, unsigned long p1,
+static inline long __kvm_hypercall4(unsigned long nr, unsigned long p1,
 			       unsigned long p2, unsigned long p3,
 			       unsigned long p4)
 {
@@ -98,7 +122,15 @@ static inline long kvm_hypercall4(unsigned long nr, unsigned long p1,
 	return __rc;
 }
 
-static inline long kvm_hypercall5(unsigned long nr, unsigned long p1,
+static inline long kvm_hypercall4(unsigned long nr, unsigned long p1,
+			       unsigned long p2, unsigned long p3,
+			       unsigned long p4)
+{
+	diag_stat_inc(DIAG_STAT_X500);
+	return __kvm_hypercall4(nr, p1, p2, p3, p4);
+}
+
+static inline long __kvm_hypercall5(unsigned long nr, unsigned long p1,
 			       unsigned long p2, unsigned long p3,
 			       unsigned long p4, unsigned long p5)
 {
@@ -116,7 +148,15 @@ static inline long kvm_hypercall5(unsigned long nr, unsigned long p1,
 	return __rc;
 }
 
-static inline long kvm_hypercall6(unsigned long nr, unsigned long p1,
+static inline long kvm_hypercall5(unsigned long nr, unsigned long p1,
+			       unsigned long p2, unsigned long p3,
+			       unsigned long p4, unsigned long p5)
+{
+	diag_stat_inc(DIAG_STAT_X500);
+	return __kvm_hypercall5(nr, p1, p2, p3, p4, p5);
+}
+
+static inline long __kvm_hypercall6(unsigned long nr, unsigned long p1,
 			       unsigned long p2, unsigned long p3,
 			       unsigned long p4, unsigned long p5,
 			       unsigned long p6)
@@ -137,6 +177,15 @@ static inline long kvm_hypercall6(unsigned long nr, unsigned long p1,
 	return __rc;
 }
 
+static inline long kvm_hypercall6(unsigned long nr, unsigned long p1,
+			       unsigned long p2, unsigned long p3,
+			       unsigned long p4, unsigned long p5,
+			       unsigned long p6)
+{
+	diag_stat_inc(DIAG_STAT_X500);
+	return __kvm_hypercall6(nr, p1, p2, p3, p4, p5, p6);
+}
+
 /* kvm on s390 is always paravirtualization enabled */
 static inline int kvm_para_available(void)
 {
diff --git a/arch/s390/include/asm/lowcore.h b/arch/s390/include/asm/lowcore.h
index 663f23e37460..afe1cfebf1a4 100644
--- a/arch/s390/include/asm/lowcore.h
+++ b/arch/s390/include/asm/lowcore.h
@@ -67,7 +67,7 @@ struct _lowcore {
 	__u8	pad_0x00c4[0x00c8-0x00c4];	/* 0x00c4 */
 	__u32	stfl_fac_list;			/* 0x00c8 */
 	__u8	pad_0x00cc[0x00e8-0x00cc];	/* 0x00cc */
-	__u32	mcck_interruption_code[2];	/* 0x00e8 */
+	__u64	mcck_interruption_code;		/* 0x00e8 */
 	__u8	pad_0x00f0[0x00f4-0x00f0];	/* 0x00f0 */
 	__u32	external_damage_code;		/* 0x00f4 */
 	__u64	failing_storage_address;	/* 0x00f8 */
@@ -132,7 +132,14 @@ struct _lowcore {
 	/* Address space pointer. */
 	__u64	kernel_asce;			/* 0x0358 */
 	__u64	user_asce;			/* 0x0360 */
-	__u64	current_pid;			/* 0x0368 */
+
+	/*
+	 * The lpp and current_pid fields form a
+	 * 64-bit value that is set as program
+	 * parameter with the LPP instruction.
+	 */
+	__u32	lpp;				/* 0x0368 */
+	__u32	current_pid;			/* 0x036c */
 
 	/* SMP info area */
 	__u32	cpu_nr;				/* 0x0370 */
diff --git a/arch/s390/include/asm/nmi.h b/arch/s390/include/asm/nmi.h
index 3027a5a72b74..b75fd910386a 100644
--- a/arch/s390/include/asm/nmi.h
+++ b/arch/s390/include/asm/nmi.h
@@ -11,51 +11,62 @@
 #ifndef _ASM_S390_NMI_H
 #define _ASM_S390_NMI_H
 
+#include <linux/const.h>
 #include <linux/types.h>
 
-struct mci {
-	__u32 sd :  1; /* 00 system damage */
-	__u32 pd :  1; /* 01 instruction-processing damage */
-	__u32 sr :  1; /* 02 system recovery */
-	__u32	 :  1; /* 03 */
-	__u32 cd :  1; /* 04 timing-facility damage */
-	__u32 ed :  1; /* 05 external damage */
-	__u32	 :  1; /* 06 */
-	__u32 dg :  1; /* 07 degradation */
-	__u32 w  :  1; /* 08 warning pending */
-	__u32 cp :  1; /* 09 channel-report pending */
-	__u32 sp :  1; /* 10 service-processor damage */
-	__u32 ck :  1; /* 11 channel-subsystem damage */
-	__u32	 :  2; /* 12-13 */
-	__u32 b  :  1; /* 14 backed up */
-	__u32	 :  1; /* 15 */
-	__u32 se :  1; /* 16 storage error uncorrected */
-	__u32 sc :  1; /* 17 storage error corrected */
-	__u32 ke :  1; /* 18 storage-key error uncorrected */
-	__u32 ds :  1; /* 19 storage degradation */
-	__u32 wp :  1; /* 20 psw mwp validity */
-	__u32 ms :  1; /* 21 psw mask and key validity */
-	__u32 pm :  1; /* 22 psw program mask and cc validity */
-	__u32 ia :  1; /* 23 psw instruction address validity */
-	__u32 fa :  1; /* 24 failing storage address validity */
-	__u32 vr :  1; /* 25 vector register validity */
-	__u32 ec :  1; /* 26 external damage code validity */
-	__u32 fp :  1; /* 27 floating point register validity */
-	__u32 gr :  1; /* 28 general register validity */
-	__u32 cr :  1; /* 29 control register validity */
-	__u32	 :  1; /* 30 */
-	__u32 st :  1; /* 31 storage logical validity */
-	__u32 ie :  1; /* 32 indirect storage error */
-	__u32 ar :  1; /* 33 access register validity */
-	__u32 da :  1; /* 34 delayed access exception */
-	__u32	 :  7; /* 35-41 */
-	__u32 pr :  1; /* 42 tod programmable register validity */
-	__u32 fc :  1; /* 43 fp control register validity */
-	__u32 ap :  1; /* 44 ancillary report */
-	__u32	 :  1; /* 45 */
-	__u32 ct :  1; /* 46 cpu timer validity */
-	__u32 cc :  1; /* 47 clock comparator validity */
-	__u32	 : 16; /* 47-63 */
+#define MCCK_CODE_SYSTEM_DAMAGE		_BITUL(63)
+#define MCCK_CODE_CPU_TIMER_VALID	_BITUL(63 - 46)
+#define MCCK_CODE_PSW_MWP_VALID		_BITUL(63 - 20)
+#define MCCK_CODE_PSW_IA_VALID		_BITUL(63 - 23)
+
+#ifndef __ASSEMBLY__
+
+union mci {
+	unsigned long val;
+	struct {
+		u64 sd :  1; /* 00 system damage */
+		u64 pd :  1; /* 01 instruction-processing damage */
+		u64 sr :  1; /* 02 system recovery */
+		u64    :  1; /* 03 */
+		u64 cd :  1; /* 04 timing-facility damage */
+		u64 ed :  1; /* 05 external damage */
+		u64    :  1; /* 06 */
+		u64 dg :  1; /* 07 degradation */
+		u64 w  :  1; /* 08 warning pending */
+		u64 cp :  1; /* 09 channel-report pending */
+		u64 sp :  1; /* 10 service-processor damage */
+		u64 ck :  1; /* 11 channel-subsystem damage */
+		u64    :  2; /* 12-13 */
+		u64 b  :  1; /* 14 backed up */
+		u64    :  1; /* 15 */
+		u64 se :  1; /* 16 storage error uncorrected */
+		u64 sc :  1; /* 17 storage error corrected */
+		u64 ke :  1; /* 18 storage-key error uncorrected */
+		u64 ds :  1; /* 19 storage degradation */
+		u64 wp :  1; /* 20 psw mwp validity */
+		u64 ms :  1; /* 21 psw mask and key validity */
+		u64 pm :  1; /* 22 psw program mask and cc validity */
+		u64 ia :  1; /* 23 psw instruction address validity */
+		u64 fa :  1; /* 24 failing storage address validity */
+		u64 vr :  1; /* 25 vector register validity */
+		u64 ec :  1; /* 26 external damage code validity */
+		u64 fp :  1; /* 27 floating point register validity */
+		u64 gr :  1; /* 28 general register validity */
+		u64 cr :  1; /* 29 control register validity */
+		u64    :  1; /* 30 */
+		u64 st :  1; /* 31 storage logical validity */
+		u64 ie :  1; /* 32 indirect storage error */
+		u64 ar :  1; /* 33 access register validity */
+		u64 da :  1; /* 34 delayed access exception */
+		u64    :  7; /* 35-41 */
+		u64 pr :  1; /* 42 tod programmable register validity */
+		u64 fc :  1; /* 43 fp control register validity */
+		u64 ap :  1; /* 44 ancillary report */
+		u64    :  1; /* 45 */
+		u64 ct :  1; /* 46 cpu timer validity */
+		u64 cc :  1; /* 47 clock comparator validity */
+		u64    : 16; /* 47-63 */
+	};
 };
 
 struct pt_regs;
@@ -63,4 +74,5 @@ struct pt_regs;
 extern void s390_handle_mcck(void);
 extern void s390_do_machine_check(struct pt_regs *regs);
 
+#endif /* __ASSEMBLY__ */
 #endif /* _ASM_S390_NMI_H */
diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h
index bdb2f51124ed..024f85f947ae 100644
--- a/arch/s390/include/asm/pgtable.h
+++ b/arch/s390/include/asm/pgtable.h
@@ -193,9 +193,15 @@ static inline int is_module_addr(void *addr)
 #define _PAGE_UNUSED	0x080		/* SW bit for pgste usage state */
 #define __HAVE_ARCH_PTE_SPECIAL
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _PAGE_SOFT_DIRTY 0x002		/* SW pte soft dirty bit */
+#else
+#define _PAGE_SOFT_DIRTY 0x000
+#endif
+
 /* Set of bits not changed in pte_modify */
 #define _PAGE_CHG_MASK		(PAGE_MASK | _PAGE_SPECIAL | _PAGE_DIRTY | \
-				 _PAGE_YOUNG)
+				 _PAGE_YOUNG | _PAGE_SOFT_DIRTY)
 
 /*
  * handle_pte_fault uses pte_present and pte_none to find out the pte type
@@ -285,6 +291,12 @@ static inline int is_module_addr(void *addr)
 #define _SEGMENT_ENTRY_READ	0x0002	/* SW segment read bit */
 #define _SEGMENT_ENTRY_WRITE	0x0001	/* SW segment write bit */
 
+#ifdef CONFIG_MEM_SOFT_DIRTY
+#define _SEGMENT_ENTRY_SOFT_DIRTY 0x4000 /* SW segment soft dirty bit */
+#else
+#define _SEGMENT_ENTRY_SOFT_DIRTY 0x0000 /* SW segment soft dirty bit */
+#endif
+
 /*
  * Segment table entry encoding (R = read-only, I = invalid, y = young bit):
  *				dy..R...I...wr
@@ -589,6 +601,43 @@ static inline int pmd_protnone(pmd_t pmd)
 }
 #endif
 
+static inline int pte_soft_dirty(pte_t pte)
+{
+	return pte_val(pte) & _PAGE_SOFT_DIRTY;
+}
+#define pte_swp_soft_dirty pte_soft_dirty
+
+static inline pte_t pte_mksoft_dirty(pte_t pte)
+{
+	pte_val(pte) |= _PAGE_SOFT_DIRTY;
+	return pte;
+}
+#define pte_swp_mksoft_dirty pte_mksoft_dirty
+
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+	pte_val(pte) &= ~_PAGE_SOFT_DIRTY;
+	return pte;
+}
+#define pte_swp_clear_soft_dirty pte_clear_soft_dirty
+
+static inline int pmd_soft_dirty(pmd_t pmd)
+{
+	return pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY;
+}
+
+static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
+{
+	pmd_val(pmd) |= _SEGMENT_ENTRY_SOFT_DIRTY;
+	return pmd;
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+	pmd_val(pmd) &= ~_SEGMENT_ENTRY_SOFT_DIRTY;
+	return pmd;
+}
+
 static inline pgste_t pgste_get_lock(pte_t *ptep)
 {
 	unsigned long new = 0;
@@ -889,7 +938,7 @@ static inline pte_t pte_mkclean(pte_t pte)
 
 static inline pte_t pte_mkdirty(pte_t pte)
 {
-	pte_val(pte) |= _PAGE_DIRTY;
+	pte_val(pte) |= _PAGE_DIRTY | _PAGE_SOFT_DIRTY;
 	if (pte_val(pte) & _PAGE_WRITE)
 		pte_val(pte) &= ~_PAGE_PROTECT;
 	return pte;
@@ -1218,8 +1267,10 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 					pte_t entry, int dirty)
 {
 	pgste_t pgste;
+	pte_t oldpte;
 
-	if (pte_same(*ptep, entry))
+	oldpte = *ptep;
+	if (pte_same(oldpte, entry))
 		return 0;
 	if (mm_has_pgste(vma->vm_mm)) {
 		pgste = pgste_get_lock(ptep);
@@ -1229,7 +1280,8 @@ static inline int ptep_set_access_flags(struct vm_area_struct *vma,
 	ptep_flush_direct(vma->vm_mm, address, ptep);
 
 	if (mm_has_pgste(vma->vm_mm)) {
-		pgste_set_key(ptep, pgste, entry, vma->vm_mm);
+		if (pte_val(oldpte) & _PAGE_INVALID)
+			pgste_set_key(ptep, pgste, entry, vma->vm_mm);
 		pgste = pgste_set_pte(ptep, pgste, entry);
 		pgste_set_unlock(ptep, pgste);
 	} else
@@ -1340,7 +1392,8 @@ static inline pmd_t pmd_mkclean(pmd_t pmd)
 static inline pmd_t pmd_mkdirty(pmd_t pmd)
 {
 	if (pmd_large(pmd)) {
-		pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY;
+		pmd_val(pmd) |= _SEGMENT_ENTRY_DIRTY |
+				_SEGMENT_ENTRY_SOFT_DIRTY;
 		if (pmd_val(pmd) & _SEGMENT_ENTRY_WRITE)
 			pmd_val(pmd) &= ~_SEGMENT_ENTRY_PROTECT;
 	}
@@ -1371,7 +1424,8 @@ static inline pmd_t pmd_modify(pmd_t pmd, pgprot_t newprot)
 	if (pmd_large(pmd)) {
 		pmd_val(pmd) &= _SEGMENT_ENTRY_ORIGIN_LARGE |
 			_SEGMENT_ENTRY_DIRTY | _SEGMENT_ENTRY_YOUNG |
-			_SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT;
+			_SEGMENT_ENTRY_LARGE | _SEGMENT_ENTRY_SPLIT |
+			_SEGMENT_ENTRY_SOFT_DIRTY;
 		pmd_val(pmd) |= massage_pgprot_pmd(newprot);
 		if (!(pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY))
 			pmd_val(pmd) |= _SEGMENT_ENTRY_PROTECT;
diff --git a/arch/s390/include/asm/processor.h b/arch/s390/include/asm/processor.h
index 085fb0d3c54e..b16c3d0a1b9f 100644
--- a/arch/s390/include/asm/processor.h
+++ b/arch/s390/include/asm/processor.h
@@ -11,15 +11,19 @@
 #ifndef __ASM_S390_PROCESSOR_H
 #define __ASM_S390_PROCESSOR_H
 
+#include <linux/const.h>
+
 #define CIF_MCCK_PENDING	0	/* machine check handling is pending */
 #define CIF_ASCE		1	/* user asce needs fixup / uaccess */
 #define CIF_NOHZ_DELAY		2	/* delay HZ disable for a tick */
-#define CIF_FPU			3	/* restore vector registers */
+#define CIF_FPU			3	/* restore FPU registers */
+#define CIF_IGNORE_IRQ		4	/* ignore interrupt (for udelay) */
 
-#define _CIF_MCCK_PENDING	(1<<CIF_MCCK_PENDING)
-#define _CIF_ASCE		(1<<CIF_ASCE)
-#define _CIF_NOHZ_DELAY		(1<<CIF_NOHZ_DELAY)
-#define _CIF_FPU		(1<<CIF_FPU)
+#define _CIF_MCCK_PENDING	_BITUL(CIF_MCCK_PENDING)
+#define _CIF_ASCE		_BITUL(CIF_ASCE)
+#define _CIF_NOHZ_DELAY		_BITUL(CIF_NOHZ_DELAY)
+#define _CIF_FPU		_BITUL(CIF_FPU)
+#define _CIF_IGNORE_IRQ		_BITUL(CIF_IGNORE_IRQ)
 
 #ifndef __ASSEMBLY__
 
@@ -30,21 +34,22 @@
 #include <asm/ptrace.h>
 #include <asm/setup.h>
 #include <asm/runtime_instr.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/types.h>
+#include <asm/fpu/internal.h>
 
 static inline void set_cpu_flag(int flag)
 {
-	S390_lowcore.cpu_flags |= (1U << flag);
+	S390_lowcore.cpu_flags |= (1UL << flag);
 }
 
 static inline void clear_cpu_flag(int flag)
 {
-	S390_lowcore.cpu_flags &= ~(1U << flag);
+	S390_lowcore.cpu_flags &= ~(1UL << flag);
 }
 
 static inline int test_cpu_flag(int flag)
 {
-	return !!(S390_lowcore.cpu_flags & (1U << flag));
+	return !!(S390_lowcore.cpu_flags & (1UL << flag));
 }
 
 #define arch_needs_cpu() test_cpu_flag(CIF_NOHZ_DELAY)
@@ -102,7 +107,6 @@ struct thread_struct {
 	struct list_head list;
 	/* cpu runtime instrumentation */
 	struct runtime_instr_cb *ri_cb;
-	int ri_signum;
 	unsigned char trap_tdb[256];	/* Transaction abort diagnose block */
 };
 
@@ -139,8 +143,10 @@ struct stack_frame {
 
 #define ARCH_MIN_TASKALIGN	8
 
+extern __vector128 init_task_fpu_regs[__NUM_VXRS];
 #define INIT_THREAD {							\
 	.ksp = sizeof(init_stack) + (unsigned long) &init_stack,	\
+	.fpu.regs = (void *)&init_task_fpu_regs,			\
 }
 
 /*
@@ -217,7 +223,7 @@ static inline void __load_psw(psw_t psw)
  * Set PSW mask to specified value, while leaving the
  * PSW addr pointing to the next instruction.
  */
-static inline void __load_psw_mask (unsigned long mask)
+static inline void __load_psw_mask(unsigned long mask)
 {
 	unsigned long addr;
 	psw_t psw;
@@ -243,6 +249,16 @@ static inline unsigned long __extract_psw(void)
 	return (((unsigned long) reg1) << 32) | ((unsigned long) reg2);
 }
 
+static inline void local_mcck_enable(void)
+{
+	__load_psw_mask(__extract_psw() | PSW_MASK_MCHECK);
+}
+
+static inline void local_mcck_disable(void)
+{
+	__load_psw_mask(__extract_psw() & ~PSW_MASK_MCHECK);
+}
+
 /*
  * Rewind PSW instruction address by specified number of bytes.
  */
@@ -266,65 +282,14 @@ void enabled_wait(void);
  */
 static inline void __noreturn disabled_wait(unsigned long code)
 {
-        unsigned long ctl_buf;
-        psw_t dw_psw;
-
-	dw_psw.mask = PSW_MASK_BASE | PSW_MASK_WAIT | PSW_MASK_BA | PSW_MASK_EA;
-        dw_psw.addr = code;
-        /* 
-         * Store status and then load disabled wait psw,
-         * the processor is dead afterwards
-         */
-	asm volatile(
-		"	stctg	0,0,0(%2)\n"
-		"	ni	4(%2),0xef\n"	/* switch off protection */
-		"	lctlg	0,0,0(%2)\n"
-		"	lghi	1,0x1000\n"
-		"	stpt	0x328(1)\n"	/* store timer */
-		"	stckc	0x330(1)\n"	/* store clock comparator */
-		"	stpx	0x318(1)\n"	/* store prefix register */
-		"	stam	0,15,0x340(1)\n"/* store access registers */
-		"	stfpc	0x31c(1)\n"	/* store fpu control */
-		"	std	0,0x200(1)\n"	/* store f0 */
-		"	std	1,0x208(1)\n"	/* store f1 */
-		"	std	2,0x210(1)\n"	/* store f2 */
-		"	std	3,0x218(1)\n"	/* store f3 */
-		"	std	4,0x220(1)\n"	/* store f4 */
-		"	std	5,0x228(1)\n"	/* store f5 */
-		"	std	6,0x230(1)\n"	/* store f6 */
-		"	std	7,0x238(1)\n"	/* store f7 */
-		"	std	8,0x240(1)\n"	/* store f8 */
-		"	std	9,0x248(1)\n"	/* store f9 */
-		"	std	10,0x250(1)\n"	/* store f10 */
-		"	std	11,0x258(1)\n"	/* store f11 */
-		"	std	12,0x260(1)\n"	/* store f12 */
-		"	std	13,0x268(1)\n"	/* store f13 */
-		"	std	14,0x270(1)\n"	/* store f14 */
-		"	std	15,0x278(1)\n"	/* store f15 */
-		"	stmg	0,15,0x280(1)\n"/* store general registers */
-		"	stctg	0,15,0x380(1)\n"/* store control registers */
-		"	oi	0x384(1),0x10\n"/* fake protection bit */
-		"	lpswe	0(%1)"
-		: "=m" (ctl_buf)
-		: "a" (&dw_psw), "a" (&ctl_buf), "m" (dw_psw) : "cc", "0", "1");
-	while (1);
-}
+	psw_t psw;
 
-/*
- * Use to set psw mask except for the first byte which
- * won't be changed by this function.
- */
-static inline void
-__set_psw_mask(unsigned long mask)
-{
-	__load_psw_mask(mask | (arch_local_save_flags() & ~(-1UL >> 8)));
+	psw.mask = PSW_MASK_BASE | PSW_MASK_WAIT | PSW_MASK_BA | PSW_MASK_EA;
+	psw.addr = code;
+	__load_psw(psw);
+	while (1);
 }
 
-#define local_mcck_enable() \
-	__set_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT | PSW_MASK_MCHECK)
-#define local_mcck_disable() \
-	__set_psw_mask(PSW_KERNEL_BITS | PSW_MASK_DAT)
-
 /*
  * Basic Machine Check/Program Check Handler.
  */
diff --git a/arch/s390/include/asm/ptrace.h b/arch/s390/include/asm/ptrace.h
index 6feda2599282..37cbc50947f2 100644
--- a/arch/s390/include/asm/ptrace.h
+++ b/arch/s390/include/asm/ptrace.h
@@ -6,13 +6,14 @@
 #ifndef _S390_PTRACE_H
 #define _S390_PTRACE_H
 
+#include <linux/const.h>
 #include <uapi/asm/ptrace.h>
 
 #define PIF_SYSCALL		0	/* inside a system call */
 #define PIF_PER_TRAP		1	/* deliver sigtrap on return to user */
 
-#define _PIF_SYSCALL		(1<<PIF_SYSCALL)
-#define _PIF_PER_TRAP		(1<<PIF_PER_TRAP)
+#define _PIF_SYSCALL		_BITUL(PIF_SYSCALL)
+#define _PIF_PER_TRAP		_BITUL(PIF_PER_TRAP)
 
 #ifndef __ASSEMBLY__
 
@@ -128,17 +129,17 @@ struct per_struct_kernel {
 
 static inline void set_pt_regs_flag(struct pt_regs *regs, int flag)
 {
-	regs->flags |= (1U << flag);
+	regs->flags |= (1UL << flag);
 }
 
 static inline void clear_pt_regs_flag(struct pt_regs *regs, int flag)
 {
-	regs->flags &= ~(1U << flag);
+	regs->flags &= ~(1UL << flag);
 }
 
 static inline int test_pt_regs_flag(struct pt_regs *regs, int flag)
 {
-	return !!(regs->flags & (1U << flag));
+	return !!(regs->flags & (1UL << flag));
 }
 
 /*
diff --git a/arch/s390/include/asm/setup.h b/arch/s390/include/asm/setup.h
index b8ffc1bd0a9f..23537661da0e 100644
--- a/arch/s390/include/asm/setup.h
+++ b/arch/s390/include/asm/setup.h
@@ -5,11 +5,38 @@
 #ifndef _ASM_S390_SETUP_H
 #define _ASM_S390_SETUP_H
 
+#include <linux/const.h>
 #include <uapi/asm/setup.h>
 
 
 #define PARMAREA		0x10400
 
+/*
+ * Machine features detected in head.S
+ */
+
+#define MACHINE_FLAG_VM		_BITUL(0)
+#define MACHINE_FLAG_IEEE	_BITUL(1)
+#define MACHINE_FLAG_CSP	_BITUL(2)
+#define MACHINE_FLAG_MVPG	_BITUL(3)
+#define MACHINE_FLAG_DIAG44	_BITUL(4)
+#define MACHINE_FLAG_IDTE	_BITUL(5)
+#define MACHINE_FLAG_DIAG9C	_BITUL(6)
+#define MACHINE_FLAG_KVM	_BITUL(8)
+#define MACHINE_FLAG_ESOP	_BITUL(9)
+#define MACHINE_FLAG_EDAT1	_BITUL(10)
+#define MACHINE_FLAG_EDAT2	_BITUL(11)
+#define MACHINE_FLAG_LPAR	_BITUL(12)
+#define MACHINE_FLAG_LPP	_BITUL(13)
+#define MACHINE_FLAG_TOPOLOGY	_BITUL(14)
+#define MACHINE_FLAG_TE		_BITUL(15)
+#define MACHINE_FLAG_TLB_LC	_BITUL(17)
+#define MACHINE_FLAG_VX		_BITUL(18)
+#define MACHINE_FLAG_CAD	_BITUL(19)
+
+#define LPP_MAGIC		_BITUL(31)
+#define LPP_PFAULT_PID_MASK	_AC(0xffffffff, UL)
+
 #ifndef __ASSEMBLY__
 
 #include <asm/lowcore.h>
@@ -28,29 +55,6 @@ extern unsigned long max_physmem_end;
 
 extern void detect_memory_memblock(void);
 
-/*
- * Machine features detected in head.S
- */
-
-#define MACHINE_FLAG_VM		(1UL << 0)
-#define MACHINE_FLAG_IEEE	(1UL << 1)
-#define MACHINE_FLAG_CSP	(1UL << 2)
-#define MACHINE_FLAG_MVPG	(1UL << 3)
-#define MACHINE_FLAG_DIAG44	(1UL << 4)
-#define MACHINE_FLAG_IDTE	(1UL << 5)
-#define MACHINE_FLAG_DIAG9C	(1UL << 6)
-#define MACHINE_FLAG_KVM	(1UL << 8)
-#define MACHINE_FLAG_ESOP	(1UL << 9)
-#define MACHINE_FLAG_EDAT1	(1UL << 10)
-#define MACHINE_FLAG_EDAT2	(1UL << 11)
-#define MACHINE_FLAG_LPAR	(1UL << 12)
-#define MACHINE_FLAG_LPP	(1UL << 13)
-#define MACHINE_FLAG_TOPOLOGY	(1UL << 14)
-#define MACHINE_FLAG_TE		(1UL << 15)
-#define MACHINE_FLAG_TLB_LC	(1UL << 17)
-#define MACHINE_FLAG_VX		(1UL << 18)
-#define MACHINE_FLAG_CAD	(1UL << 19)
-
 #define MACHINE_IS_VM		(S390_lowcore.machine_flags & MACHINE_FLAG_VM)
 #define MACHINE_IS_KVM		(S390_lowcore.machine_flags & MACHINE_FLAG_KVM)
 #define MACHINE_IS_LPAR		(S390_lowcore.machine_flags & MACHINE_FLAG_LPAR)
diff --git a/arch/s390/include/asm/spinlock.h b/arch/s390/include/asm/spinlock.h
index 0e37cd041241..63ebf37d3143 100644
--- a/arch/s390/include/asm/spinlock.h
+++ b/arch/s390/include/asm/spinlock.h
@@ -87,7 +87,6 @@ static inline void arch_spin_unlock(arch_spinlock_t *lp)
 {
 	typecheck(unsigned int, lp->lock);
 	asm volatile(
-		__ASM_BARRIER
 		"st	%1,%0\n"
 		: "+Q" (lp->lock)
 		: "d" (0)
@@ -169,7 +168,6 @@ static inline int arch_write_trylock_once(arch_rwlock_t *rw)
 							\
 	typecheck(unsigned int *, ptr);			\
 	asm volatile(					\
-		"bcr	14,0\n"				\
 		op_string "	%0,%2,%1\n"		\
 		: "=d" (old_val), "+Q" (*ptr)		\
 		: "d" (op_val)				\
@@ -243,7 +241,6 @@ static inline void arch_write_unlock(arch_rwlock_t *rw)
 
 	rw->owner = 0;
 	asm volatile(
-		__ASM_BARRIER
 		"st	%1,%0\n"
 		: "+Q" (rw->lock)
 		: "d" (0)
diff --git a/arch/s390/include/asm/switch_to.h b/arch/s390/include/asm/switch_to.h
index dcadfde32265..12d45f0cfdd9 100644
--- a/arch/s390/include/asm/switch_to.h
+++ b/arch/s390/include/asm/switch_to.h
@@ -8,7 +8,7 @@
 #define __ASM_SWITCH_TO_H
 
 #include <linux/thread_info.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
 #include <asm/ptrace.h>
 
 extern struct task_struct *__switch_to(void *, void *);
diff --git a/arch/s390/include/asm/thread_info.h b/arch/s390/include/asm/thread_info.h
index 4c27ec764c36..692b9247c019 100644
--- a/arch/s390/include/asm/thread_info.h
+++ b/arch/s390/include/asm/thread_info.h
@@ -7,6 +7,8 @@
 #ifndef _ASM_THREAD_INFO_H
 #define _ASM_THREAD_INFO_H
 
+#include <linux/const.h>
+
 /*
  * Size of kernel stack for each process
  */
@@ -83,16 +85,16 @@ void arch_release_task_struct(struct task_struct *tsk);
 #define TIF_BLOCK_STEP		20	/* This task is block stepped */
 #define TIF_UPROBE_SINGLESTEP	21	/* This task is uprobe single stepped */
 
-#define _TIF_NOTIFY_RESUME	(1<<TIF_NOTIFY_RESUME)
-#define _TIF_SIGPENDING		(1<<TIF_SIGPENDING)
-#define _TIF_NEED_RESCHED	(1<<TIF_NEED_RESCHED)
-#define _TIF_SYSCALL_TRACE	(1<<TIF_SYSCALL_TRACE)
-#define _TIF_SYSCALL_AUDIT	(1<<TIF_SYSCALL_AUDIT)
-#define _TIF_SECCOMP		(1<<TIF_SECCOMP)
-#define _TIF_SYSCALL_TRACEPOINT	(1<<TIF_SYSCALL_TRACEPOINT)
-#define _TIF_UPROBE		(1<<TIF_UPROBE)
-#define _TIF_31BIT		(1<<TIF_31BIT)
-#define _TIF_SINGLE_STEP	(1<<TIF_SINGLE_STEP)
+#define _TIF_NOTIFY_RESUME	_BITUL(TIF_NOTIFY_RESUME)
+#define _TIF_SIGPENDING		_BITUL(TIF_SIGPENDING)
+#define _TIF_NEED_RESCHED	_BITUL(TIF_NEED_RESCHED)
+#define _TIF_SYSCALL_TRACE	_BITUL(TIF_SYSCALL_TRACE)
+#define _TIF_SYSCALL_AUDIT	_BITUL(TIF_SYSCALL_AUDIT)
+#define _TIF_SECCOMP		_BITUL(TIF_SECCOMP)
+#define _TIF_SYSCALL_TRACEPOINT	_BITUL(TIF_SYSCALL_TRACEPOINT)
+#define _TIF_UPROBE		_BITUL(TIF_UPROBE)
+#define _TIF_31BIT		_BITUL(TIF_31BIT)
+#define _TIF_SINGLE_STEP	_BITUL(TIF_SINGLE_STEP)
 
 #define is_32bit_task()		(test_thread_flag(TIF_31BIT))
 
diff --git a/arch/s390/include/asm/trace/diag.h b/arch/s390/include/asm/trace/diag.h
new file mode 100644
index 000000000000..776f307960cc
--- /dev/null
+++ b/arch/s390/include/asm/trace/diag.h
@@ -0,0 +1,43 @@
+/*
+ * Tracepoint header for s390 diagnose calls
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM s390
+
+#if !defined(_TRACE_S390_DIAG_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_S390_DIAG_H
+
+#include <linux/tracepoint.h>
+
+#undef TRACE_INCLUDE_PATH
+#undef TRACE_INCLUDE_FILE
+
+#define TRACE_INCLUDE_PATH asm/trace
+#define TRACE_INCLUDE_FILE diag
+
+TRACE_EVENT(diagnose,
+	TP_PROTO(unsigned short nr),
+	TP_ARGS(nr),
+	TP_STRUCT__entry(
+		__field(unsigned short, nr)
+	),
+	TP_fast_assign(
+		__entry->nr = nr;
+	),
+	TP_printk("nr=0x%x", __entry->nr)
+);
+
+#ifdef CONFIG_TRACEPOINTS
+void trace_diagnose_norecursion(int diag_nr);
+#else
+static inline void trace_diagnose_norecursion(int diag_nr) { }
+#endif
+
+#endif /* _TRACE_S390_DIAG_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/arch/s390/kernel/Makefile b/arch/s390/kernel/Makefile
index b756c6348ac6..dc167a23b920 100644
--- a/arch/s390/kernel/Makefile
+++ b/arch/s390/kernel/Makefile
@@ -66,6 +66,8 @@ obj-$(CONFIG_UPROBES)		+= uprobes.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_event.o perf_cpum_cf.o perf_cpum_sf.o
 obj-$(CONFIG_PERF_EVENTS)	+= perf_cpum_cf_events.o
 
+obj-$(CONFIG_TRACEPOINTS)	+= trace.o
+
 # vdso
 obj-y				+= vdso64/
 obj-$(CONFIG_COMPAT)		+= vdso32/
diff --git a/arch/s390/kernel/asm-offsets.c b/arch/s390/kernel/asm-offsets.c
index 3aeeb1b562c0..9cd248f637c7 100644
--- a/arch/s390/kernel/asm-offsets.c
+++ b/arch/s390/kernel/asm-offsets.c
@@ -23,59 +23,64 @@
 
 int main(void)
 {
-	DEFINE(__TASK_thread_info, offsetof(struct task_struct, stack));
-	DEFINE(__TASK_thread, offsetof(struct task_struct, thread));
-	DEFINE(__TASK_pid, offsetof(struct task_struct, pid));
+	/* task struct offsets */
+	OFFSET(__TASK_thread_info, task_struct, stack);
+	OFFSET(__TASK_thread, task_struct, thread);
+	OFFSET(__TASK_pid, task_struct, pid);
 	BLANK();
-	DEFINE(__THREAD_ksp, offsetof(struct thread_struct, ksp));
-	DEFINE(__THREAD_FPU_fpc, offsetof(struct thread_struct, fpu.fpc));
-	DEFINE(__THREAD_FPU_flags, offsetof(struct thread_struct, fpu.flags));
-	DEFINE(__THREAD_FPU_regs, offsetof(struct thread_struct, fpu.regs));
-	DEFINE(__THREAD_per_cause, offsetof(struct thread_struct, per_event.cause));
-	DEFINE(__THREAD_per_address, offsetof(struct thread_struct, per_event.address));
-	DEFINE(__THREAD_per_paid, offsetof(struct thread_struct, per_event.paid));
-	DEFINE(__THREAD_trap_tdb, offsetof(struct thread_struct, trap_tdb));
+	/* thread struct offsets */
+	OFFSET(__THREAD_ksp, thread_struct, ksp);
+	OFFSET(__THREAD_FPU_fpc, thread_struct, fpu.fpc);
+	OFFSET(__THREAD_FPU_regs, thread_struct, fpu.regs);
+	OFFSET(__THREAD_per_cause, thread_struct, per_event.cause);
+	OFFSET(__THREAD_per_address, thread_struct, per_event.address);
+	OFFSET(__THREAD_per_paid, thread_struct, per_event.paid);
+	OFFSET(__THREAD_trap_tdb, thread_struct, trap_tdb);
 	BLANK();
-	DEFINE(__TI_task, offsetof(struct thread_info, task));
-	DEFINE(__TI_flags, offsetof(struct thread_info, flags));
-	DEFINE(__TI_sysc_table, offsetof(struct thread_info, sys_call_table));
-	DEFINE(__TI_cpu, offsetof(struct thread_info, cpu));
-	DEFINE(__TI_precount, offsetof(struct thread_info, preempt_count));
-	DEFINE(__TI_user_timer, offsetof(struct thread_info, user_timer));
-	DEFINE(__TI_system_timer, offsetof(struct thread_info, system_timer));
-	DEFINE(__TI_last_break, offsetof(struct thread_info, last_break));
+	/* thread info offsets */
+	OFFSET(__TI_task, thread_info, task);
+	OFFSET(__TI_flags, thread_info, flags);
+	OFFSET(__TI_sysc_table, thread_info, sys_call_table);
+	OFFSET(__TI_cpu, thread_info, cpu);
+	OFFSET(__TI_precount, thread_info, preempt_count);
+	OFFSET(__TI_user_timer, thread_info, user_timer);
+	OFFSET(__TI_system_timer, thread_info, system_timer);
+	OFFSET(__TI_last_break, thread_info, last_break);
 	BLANK();
-	DEFINE(__PT_ARGS, offsetof(struct pt_regs, args));
-	DEFINE(__PT_PSW, offsetof(struct pt_regs, psw));
-	DEFINE(__PT_GPRS, offsetof(struct pt_regs, gprs));
-	DEFINE(__PT_ORIG_GPR2, offsetof(struct pt_regs, orig_gpr2));
-	DEFINE(__PT_INT_CODE, offsetof(struct pt_regs, int_code));
-	DEFINE(__PT_INT_PARM, offsetof(struct pt_regs, int_parm));
-	DEFINE(__PT_INT_PARM_LONG, offsetof(struct pt_regs, int_parm_long));
-	DEFINE(__PT_FLAGS, offsetof(struct pt_regs, flags));
+	/* pt_regs offsets */
+	OFFSET(__PT_ARGS, pt_regs, args);
+	OFFSET(__PT_PSW, pt_regs, psw);
+	OFFSET(__PT_GPRS, pt_regs, gprs);
+	OFFSET(__PT_ORIG_GPR2, pt_regs, orig_gpr2);
+	OFFSET(__PT_INT_CODE, pt_regs, int_code);
+	OFFSET(__PT_INT_PARM, pt_regs, int_parm);
+	OFFSET(__PT_INT_PARM_LONG, pt_regs, int_parm_long);
+	OFFSET(__PT_FLAGS, pt_regs, flags);
 	DEFINE(__PT_SIZE, sizeof(struct pt_regs));
 	BLANK();
-	DEFINE(__SF_BACKCHAIN, offsetof(struct stack_frame, back_chain));
-	DEFINE(__SF_GPRS, offsetof(struct stack_frame, gprs));
-	DEFINE(__SF_EMPTY, offsetof(struct stack_frame, empty1));
+	/* stack_frame offsets */
+	OFFSET(__SF_BACKCHAIN, stack_frame, back_chain);
+	OFFSET(__SF_GPRS, stack_frame, gprs);
+	OFFSET(__SF_EMPTY, stack_frame, empty1);
 	BLANK();
 	/* timeval/timezone offsets for use by vdso */
-	DEFINE(__VDSO_UPD_COUNT, offsetof(struct vdso_data, tb_update_count));
-	DEFINE(__VDSO_XTIME_STAMP, offsetof(struct vdso_data, xtime_tod_stamp));
-	DEFINE(__VDSO_XTIME_SEC, offsetof(struct vdso_data, xtime_clock_sec));
-	DEFINE(__VDSO_XTIME_NSEC, offsetof(struct vdso_data, xtime_clock_nsec));
-	DEFINE(__VDSO_XTIME_CRS_SEC, offsetof(struct vdso_data, xtime_coarse_sec));
-	DEFINE(__VDSO_XTIME_CRS_NSEC, offsetof(struct vdso_data, xtime_coarse_nsec));
-	DEFINE(__VDSO_WTOM_SEC, offsetof(struct vdso_data, wtom_clock_sec));
-	DEFINE(__VDSO_WTOM_NSEC, offsetof(struct vdso_data, wtom_clock_nsec));
-	DEFINE(__VDSO_WTOM_CRS_SEC, offsetof(struct vdso_data, wtom_coarse_sec));
-	DEFINE(__VDSO_WTOM_CRS_NSEC, offsetof(struct vdso_data, wtom_coarse_nsec));
-	DEFINE(__VDSO_TIMEZONE, offsetof(struct vdso_data, tz_minuteswest));
-	DEFINE(__VDSO_ECTG_OK, offsetof(struct vdso_data, ectg_available));
-	DEFINE(__VDSO_TK_MULT, offsetof(struct vdso_data, tk_mult));
-	DEFINE(__VDSO_TK_SHIFT, offsetof(struct vdso_data, tk_shift));
-	DEFINE(__VDSO_ECTG_BASE, offsetof(struct vdso_per_cpu_data, ectg_timer_base));
-	DEFINE(__VDSO_ECTG_USER, offsetof(struct vdso_per_cpu_data, ectg_user_time));
+	OFFSET(__VDSO_UPD_COUNT, vdso_data, tb_update_count);
+	OFFSET(__VDSO_XTIME_STAMP, vdso_data, xtime_tod_stamp);
+	OFFSET(__VDSO_XTIME_SEC, vdso_data, xtime_clock_sec);
+	OFFSET(__VDSO_XTIME_NSEC, vdso_data, xtime_clock_nsec);
+	OFFSET(__VDSO_XTIME_CRS_SEC, vdso_data, xtime_coarse_sec);
+	OFFSET(__VDSO_XTIME_CRS_NSEC, vdso_data, xtime_coarse_nsec);
+	OFFSET(__VDSO_WTOM_SEC, vdso_data, wtom_clock_sec);
+	OFFSET(__VDSO_WTOM_NSEC, vdso_data, wtom_clock_nsec);
+	OFFSET(__VDSO_WTOM_CRS_SEC, vdso_data, wtom_coarse_sec);
+	OFFSET(__VDSO_WTOM_CRS_NSEC, vdso_data, wtom_coarse_nsec);
+	OFFSET(__VDSO_TIMEZONE, vdso_data, tz_minuteswest);
+	OFFSET(__VDSO_ECTG_OK, vdso_data, ectg_available);
+	OFFSET(__VDSO_TK_MULT, vdso_data, tk_mult);
+	OFFSET(__VDSO_TK_SHIFT, vdso_data, tk_shift);
+	OFFSET(__VDSO_ECTG_BASE, vdso_per_cpu_data, ectg_timer_base);
+	OFFSET(__VDSO_ECTG_USER, vdso_per_cpu_data, ectg_user_time);
+	BLANK();
 	/* constants used by the vdso */
 	DEFINE(__CLOCK_REALTIME, CLOCK_REALTIME);
 	DEFINE(__CLOCK_MONOTONIC, CLOCK_MONOTONIC);
@@ -86,102 +91,105 @@ int main(void)
 	DEFINE(__CLOCK_COARSE_RES, LOW_RES_NSEC);
 	BLANK();
 	/* idle data offsets */
-	DEFINE(__CLOCK_IDLE_ENTER, offsetof(struct s390_idle_data, clock_idle_enter));
-	DEFINE(__CLOCK_IDLE_EXIT, offsetof(struct s390_idle_data, clock_idle_exit));
-	DEFINE(__TIMER_IDLE_ENTER, offsetof(struct s390_idle_data, timer_idle_enter));
-	DEFINE(__TIMER_IDLE_EXIT, offsetof(struct s390_idle_data, timer_idle_exit));
-	/* lowcore offsets */
-	DEFINE(__LC_EXT_PARAMS, offsetof(struct _lowcore, ext_params));
-	DEFINE(__LC_EXT_CPU_ADDR, offsetof(struct _lowcore, ext_cpu_addr));
-	DEFINE(__LC_EXT_INT_CODE, offsetof(struct _lowcore, ext_int_code));
-	DEFINE(__LC_SVC_ILC, offsetof(struct _lowcore, svc_ilc));
-	DEFINE(__LC_SVC_INT_CODE, offsetof(struct _lowcore, svc_code));
-	DEFINE(__LC_PGM_ILC, offsetof(struct _lowcore, pgm_ilc));
-	DEFINE(__LC_PGM_INT_CODE, offsetof(struct _lowcore, pgm_code));
-	DEFINE(__LC_TRANS_EXC_CODE, offsetof(struct _lowcore, trans_exc_code));
-	DEFINE(__LC_MON_CLASS_NR, offsetof(struct _lowcore, mon_class_num));
-	DEFINE(__LC_PER_CODE, offsetof(struct _lowcore, per_code));
-	DEFINE(__LC_PER_ATMID, offsetof(struct _lowcore, per_atmid));
-	DEFINE(__LC_PER_ADDRESS, offsetof(struct _lowcore, per_address));
-	DEFINE(__LC_EXC_ACCESS_ID, offsetof(struct _lowcore, exc_access_id));
-	DEFINE(__LC_PER_ACCESS_ID, offsetof(struct _lowcore, per_access_id));
-	DEFINE(__LC_OP_ACCESS_ID, offsetof(struct _lowcore, op_access_id));
-	DEFINE(__LC_AR_MODE_ID, offsetof(struct _lowcore, ar_mode_id));
-	DEFINE(__LC_MON_CODE, offsetof(struct _lowcore, monitor_code));
-	DEFINE(__LC_SUBCHANNEL_ID, offsetof(struct _lowcore, subchannel_id));
-	DEFINE(__LC_SUBCHANNEL_NR, offsetof(struct _lowcore, subchannel_nr));
-	DEFINE(__LC_IO_INT_PARM, offsetof(struct _lowcore, io_int_parm));
-	DEFINE(__LC_IO_INT_WORD, offsetof(struct _lowcore, io_int_word));
-	DEFINE(__LC_STFL_FAC_LIST, offsetof(struct _lowcore, stfl_fac_list));
-	DEFINE(__LC_MCCK_CODE, offsetof(struct _lowcore, mcck_interruption_code));
-	DEFINE(__LC_MCCK_EXT_DAM_CODE, offsetof(struct _lowcore, external_damage_code));
-	DEFINE(__LC_RST_OLD_PSW, offsetof(struct _lowcore, restart_old_psw));
-	DEFINE(__LC_EXT_OLD_PSW, offsetof(struct _lowcore, external_old_psw));
-	DEFINE(__LC_SVC_OLD_PSW, offsetof(struct _lowcore, svc_old_psw));
-	DEFINE(__LC_PGM_OLD_PSW, offsetof(struct _lowcore, program_old_psw));
-	DEFINE(__LC_MCK_OLD_PSW, offsetof(struct _lowcore, mcck_old_psw));
-	DEFINE(__LC_IO_OLD_PSW, offsetof(struct _lowcore, io_old_psw));
-	DEFINE(__LC_RST_NEW_PSW, offsetof(struct _lowcore, restart_psw));
-	DEFINE(__LC_EXT_NEW_PSW, offsetof(struct _lowcore, external_new_psw));
-	DEFINE(__LC_SVC_NEW_PSW, offsetof(struct _lowcore, svc_new_psw));
-	DEFINE(__LC_PGM_NEW_PSW, offsetof(struct _lowcore, program_new_psw));
-	DEFINE(__LC_MCK_NEW_PSW, offsetof(struct _lowcore, mcck_new_psw));
-	DEFINE(__LC_IO_NEW_PSW, offsetof(struct _lowcore, io_new_psw));
+	OFFSET(__CLOCK_IDLE_ENTER, s390_idle_data, clock_idle_enter);
+	OFFSET(__CLOCK_IDLE_EXIT, s390_idle_data, clock_idle_exit);
+	OFFSET(__TIMER_IDLE_ENTER, s390_idle_data, timer_idle_enter);
+	OFFSET(__TIMER_IDLE_EXIT, s390_idle_data, timer_idle_exit);
 	BLANK();
-	DEFINE(__LC_SAVE_AREA_SYNC, offsetof(struct _lowcore, save_area_sync));
-	DEFINE(__LC_SAVE_AREA_ASYNC, offsetof(struct _lowcore, save_area_async));
-	DEFINE(__LC_SAVE_AREA_RESTART, offsetof(struct _lowcore, save_area_restart));
-	DEFINE(__LC_CPU_FLAGS, offsetof(struct _lowcore, cpu_flags));
-	DEFINE(__LC_RETURN_PSW, offsetof(struct _lowcore, return_psw));
-	DEFINE(__LC_RETURN_MCCK_PSW, offsetof(struct _lowcore, return_mcck_psw));
-	DEFINE(__LC_SYNC_ENTER_TIMER, offsetof(struct _lowcore, sync_enter_timer));
-	DEFINE(__LC_ASYNC_ENTER_TIMER, offsetof(struct _lowcore, async_enter_timer));
-	DEFINE(__LC_MCCK_ENTER_TIMER, offsetof(struct _lowcore, mcck_enter_timer));
-	DEFINE(__LC_EXIT_TIMER, offsetof(struct _lowcore, exit_timer));
-	DEFINE(__LC_USER_TIMER, offsetof(struct _lowcore, user_timer));
-	DEFINE(__LC_SYSTEM_TIMER, offsetof(struct _lowcore, system_timer));
-	DEFINE(__LC_STEAL_TIMER, offsetof(struct _lowcore, steal_timer));
-	DEFINE(__LC_LAST_UPDATE_TIMER, offsetof(struct _lowcore, last_update_timer));
-	DEFINE(__LC_LAST_UPDATE_CLOCK, offsetof(struct _lowcore, last_update_clock));
-	DEFINE(__LC_CURRENT, offsetof(struct _lowcore, current_task));
-	DEFINE(__LC_CURRENT_PID, offsetof(struct _lowcore, current_pid));
-	DEFINE(__LC_THREAD_INFO, offsetof(struct _lowcore, thread_info));
-	DEFINE(__LC_KERNEL_STACK, offsetof(struct _lowcore, kernel_stack));
-	DEFINE(__LC_ASYNC_STACK, offsetof(struct _lowcore, async_stack));
-	DEFINE(__LC_PANIC_STACK, offsetof(struct _lowcore, panic_stack));
-	DEFINE(__LC_RESTART_STACK, offsetof(struct _lowcore, restart_stack));
-	DEFINE(__LC_RESTART_FN, offsetof(struct _lowcore, restart_fn));
-	DEFINE(__LC_RESTART_DATA, offsetof(struct _lowcore, restart_data));
-	DEFINE(__LC_RESTART_SOURCE, offsetof(struct _lowcore, restart_source));
-	DEFINE(__LC_KERNEL_ASCE, offsetof(struct _lowcore, kernel_asce));
-	DEFINE(__LC_USER_ASCE, offsetof(struct _lowcore, user_asce));
-	DEFINE(__LC_INT_CLOCK, offsetof(struct _lowcore, int_clock));
-	DEFINE(__LC_MCCK_CLOCK, offsetof(struct _lowcore, mcck_clock));
-	DEFINE(__LC_MACHINE_FLAGS, offsetof(struct _lowcore, machine_flags));
-	DEFINE(__LC_DUMP_REIPL, offsetof(struct _lowcore, ipib));
+	/* hardware defined lowcore locations 0x000 - 0x1ff */
+	OFFSET(__LC_EXT_PARAMS, _lowcore, ext_params);
+	OFFSET(__LC_EXT_CPU_ADDR, _lowcore, ext_cpu_addr);
+	OFFSET(__LC_EXT_INT_CODE, _lowcore, ext_int_code);
+	OFFSET(__LC_SVC_ILC, _lowcore, svc_ilc);
+	OFFSET(__LC_SVC_INT_CODE, _lowcore, svc_code);
+	OFFSET(__LC_PGM_ILC, _lowcore, pgm_ilc);
+	OFFSET(__LC_PGM_INT_CODE, _lowcore, pgm_code);
+	OFFSET(__LC_DATA_EXC_CODE, _lowcore, data_exc_code);
+	OFFSET(__LC_MON_CLASS_NR, _lowcore, mon_class_num);
+	OFFSET(__LC_PER_CODE, _lowcore, per_code);
+	OFFSET(__LC_PER_ATMID, _lowcore, per_atmid);
+	OFFSET(__LC_PER_ADDRESS, _lowcore, per_address);
+	OFFSET(__LC_EXC_ACCESS_ID, _lowcore, exc_access_id);
+	OFFSET(__LC_PER_ACCESS_ID, _lowcore, per_access_id);
+	OFFSET(__LC_OP_ACCESS_ID, _lowcore, op_access_id);
+	OFFSET(__LC_AR_MODE_ID, _lowcore, ar_mode_id);
+	OFFSET(__LC_TRANS_EXC_CODE, _lowcore, trans_exc_code);
+	OFFSET(__LC_MON_CODE, _lowcore, monitor_code);
+	OFFSET(__LC_SUBCHANNEL_ID, _lowcore, subchannel_id);
+	OFFSET(__LC_SUBCHANNEL_NR, _lowcore, subchannel_nr);
+	OFFSET(__LC_IO_INT_PARM, _lowcore, io_int_parm);
+	OFFSET(__LC_IO_INT_WORD, _lowcore, io_int_word);
+	OFFSET(__LC_STFL_FAC_LIST, _lowcore, stfl_fac_list);
+	OFFSET(__LC_MCCK_CODE, _lowcore, mcck_interruption_code);
+	OFFSET(__LC_MCCK_FAIL_STOR_ADDR, _lowcore, failing_storage_address);
+	OFFSET(__LC_LAST_BREAK, _lowcore, breaking_event_addr);
+	OFFSET(__LC_RST_OLD_PSW, _lowcore, restart_old_psw);
+	OFFSET(__LC_EXT_OLD_PSW, _lowcore, external_old_psw);
+	OFFSET(__LC_SVC_OLD_PSW, _lowcore, svc_old_psw);
+	OFFSET(__LC_PGM_OLD_PSW, _lowcore, program_old_psw);
+	OFFSET(__LC_MCK_OLD_PSW, _lowcore, mcck_old_psw);
+	OFFSET(__LC_IO_OLD_PSW, _lowcore, io_old_psw);
+	OFFSET(__LC_RST_NEW_PSW, _lowcore, restart_psw);
+	OFFSET(__LC_EXT_NEW_PSW, _lowcore, external_new_psw);
+	OFFSET(__LC_SVC_NEW_PSW, _lowcore, svc_new_psw);
+	OFFSET(__LC_PGM_NEW_PSW, _lowcore, program_new_psw);
+	OFFSET(__LC_MCK_NEW_PSW, _lowcore, mcck_new_psw);
+	OFFSET(__LC_IO_NEW_PSW, _lowcore, io_new_psw);
+	/* software defined lowcore locations 0x200 - 0xdff*/
+	OFFSET(__LC_SAVE_AREA_SYNC, _lowcore, save_area_sync);
+	OFFSET(__LC_SAVE_AREA_ASYNC, _lowcore, save_area_async);
+	OFFSET(__LC_SAVE_AREA_RESTART, _lowcore, save_area_restart);
+	OFFSET(__LC_CPU_FLAGS, _lowcore, cpu_flags);
+	OFFSET(__LC_RETURN_PSW, _lowcore, return_psw);
+	OFFSET(__LC_RETURN_MCCK_PSW, _lowcore, return_mcck_psw);
+	OFFSET(__LC_SYNC_ENTER_TIMER, _lowcore, sync_enter_timer);
+	OFFSET(__LC_ASYNC_ENTER_TIMER, _lowcore, async_enter_timer);
+	OFFSET(__LC_MCCK_ENTER_TIMER, _lowcore, mcck_enter_timer);
+	OFFSET(__LC_EXIT_TIMER, _lowcore, exit_timer);
+	OFFSET(__LC_USER_TIMER, _lowcore, user_timer);
+	OFFSET(__LC_SYSTEM_TIMER, _lowcore, system_timer);
+	OFFSET(__LC_STEAL_TIMER, _lowcore, steal_timer);
+	OFFSET(__LC_LAST_UPDATE_TIMER, _lowcore, last_update_timer);
+	OFFSET(__LC_LAST_UPDATE_CLOCK, _lowcore, last_update_clock);
+	OFFSET(__LC_INT_CLOCK, _lowcore, int_clock);
+	OFFSET(__LC_MCCK_CLOCK, _lowcore, mcck_clock);
+	OFFSET(__LC_CURRENT, _lowcore, current_task);
+	OFFSET(__LC_THREAD_INFO, _lowcore, thread_info);
+	OFFSET(__LC_KERNEL_STACK, _lowcore, kernel_stack);
+	OFFSET(__LC_ASYNC_STACK, _lowcore, async_stack);
+	OFFSET(__LC_PANIC_STACK, _lowcore, panic_stack);
+	OFFSET(__LC_RESTART_STACK, _lowcore, restart_stack);
+	OFFSET(__LC_RESTART_FN, _lowcore, restart_fn);
+	OFFSET(__LC_RESTART_DATA, _lowcore, restart_data);
+	OFFSET(__LC_RESTART_SOURCE, _lowcore, restart_source);
+	OFFSET(__LC_USER_ASCE, _lowcore, user_asce);
+	OFFSET(__LC_LPP, _lowcore, lpp);
+	OFFSET(__LC_CURRENT_PID, _lowcore, current_pid);
+	OFFSET(__LC_PERCPU_OFFSET, _lowcore, percpu_offset);
+	OFFSET(__LC_VDSO_PER_CPU, _lowcore, vdso_per_cpu_data);
+	OFFSET(__LC_MACHINE_FLAGS, _lowcore, machine_flags);
+	OFFSET(__LC_GMAP, _lowcore, gmap);
+	OFFSET(__LC_PASTE, _lowcore, paste);
+	/* software defined ABI-relevant lowcore locations 0xe00 - 0xe20 */
+	OFFSET(__LC_DUMP_REIPL, _lowcore, ipib);
+	/* hardware defined lowcore locations 0x1000 - 0x18ff */
+	OFFSET(__LC_VX_SAVE_AREA_ADDR, _lowcore, vector_save_area_addr);
+	OFFSET(__LC_EXT_PARAMS2, _lowcore, ext_params2);
+	OFFSET(SAVE_AREA_BASE, _lowcore, floating_pt_save_area);
+	OFFSET(__LC_FPREGS_SAVE_AREA, _lowcore, floating_pt_save_area);
+	OFFSET(__LC_GPREGS_SAVE_AREA, _lowcore, gpregs_save_area);
+	OFFSET(__LC_PSW_SAVE_AREA, _lowcore, psw_save_area);
+	OFFSET(__LC_PREFIX_SAVE_AREA, _lowcore, prefixreg_save_area);
+	OFFSET(__LC_FP_CREG_SAVE_AREA, _lowcore, fpt_creg_save_area);
+	OFFSET(__LC_CPU_TIMER_SAVE_AREA, _lowcore, cpu_timer_save_area);
+	OFFSET(__LC_CLOCK_COMP_SAVE_AREA, _lowcore, clock_comp_save_area);
+	OFFSET(__LC_AREGS_SAVE_AREA, _lowcore, access_regs_save_area);
+	OFFSET(__LC_CREGS_SAVE_AREA, _lowcore, cregs_save_area);
+	OFFSET(__LC_PGM_TDB, _lowcore, pgm_tdb);
 	BLANK();
-	DEFINE(__LC_CPU_TIMER_SAVE_AREA, offsetof(struct _lowcore, cpu_timer_save_area));
-	DEFINE(__LC_CLOCK_COMP_SAVE_AREA, offsetof(struct _lowcore, clock_comp_save_area));
-	DEFINE(__LC_PSW_SAVE_AREA, offsetof(struct _lowcore, psw_save_area));
-	DEFINE(__LC_PREFIX_SAVE_AREA, offsetof(struct _lowcore, prefixreg_save_area));
-	DEFINE(__LC_AREGS_SAVE_AREA, offsetof(struct _lowcore, access_regs_save_area));
-	DEFINE(__LC_FPREGS_SAVE_AREA, offsetof(struct _lowcore, floating_pt_save_area));
-	DEFINE(__LC_GPREGS_SAVE_AREA, offsetof(struct _lowcore, gpregs_save_area));
-	DEFINE(__LC_CREGS_SAVE_AREA, offsetof(struct _lowcore, cregs_save_area));
-	DEFINE(__LC_DATA_EXC_CODE, offsetof(struct _lowcore, data_exc_code));
-	DEFINE(__LC_MCCK_FAIL_STOR_ADDR, offsetof(struct _lowcore, failing_storage_address));
-	DEFINE(__LC_VX_SAVE_AREA_ADDR, offsetof(struct _lowcore, vector_save_area_addr));
-	DEFINE(__LC_EXT_PARAMS2, offsetof(struct _lowcore, ext_params2));
-	DEFINE(SAVE_AREA_BASE, offsetof(struct _lowcore, floating_pt_save_area));
-	DEFINE(__LC_PASTE, offsetof(struct _lowcore, paste));
-	DEFINE(__LC_FP_CREG_SAVE_AREA, offsetof(struct _lowcore, fpt_creg_save_area));
-	DEFINE(__LC_LAST_BREAK, offsetof(struct _lowcore, breaking_event_addr));
-	DEFINE(__LC_PERCPU_OFFSET, offsetof(struct _lowcore, percpu_offset));
-	DEFINE(__LC_VDSO_PER_CPU, offsetof(struct _lowcore, vdso_per_cpu_data));
-	DEFINE(__LC_GMAP, offsetof(struct _lowcore, gmap));
-	DEFINE(__LC_PGM_TDB, offsetof(struct _lowcore, pgm_tdb));
-	DEFINE(__GMAP_ASCE, offsetof(struct gmap, asce));
-	DEFINE(__SIE_PROG0C, offsetof(struct kvm_s390_sie_block, prog0c));
-	DEFINE(__SIE_PROG20, offsetof(struct kvm_s390_sie_block, prog20));
+	/* gmap/sie offsets */
+	OFFSET(__GMAP_ASCE, gmap, asce);
+	OFFSET(__SIE_PROG0C, kvm_s390_sie_block, prog0c);
+	OFFSET(__SIE_PROG20, kvm_s390_sie_block, prog20);
 	return 0;
 }
diff --git a/arch/s390/kernel/compat_signal.c b/arch/s390/kernel/compat_signal.c
index e0f9d270b30f..66c94417c0ba 100644
--- a/arch/s390/kernel/compat_signal.c
+++ b/arch/s390/kernel/compat_signal.c
@@ -249,7 +249,7 @@ static int save_sigregs_ext32(struct pt_regs *regs,
 		return -EFAULT;
 
 	/* Save vector registers to signal stack */
-	if (is_vx_task(current)) {
+	if (MACHINE_HAS_VX) {
 		for (i = 0; i < __NUM_VXRS_LOW; i++)
 			vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1);
 		if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
@@ -277,7 +277,7 @@ static int restore_sigregs_ext32(struct pt_regs *regs,
 		*(__u32 *)&regs->gprs[i] = gprs_high[i];
 
 	/* Restore vector registers from signal stack */
-	if (is_vx_task(current)) {
+	if (MACHINE_HAS_VX) {
 		if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
 				     sizeof(sregs_ext->vxrs_low)) ||
 		    __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
@@ -470,8 +470,7 @@ static int setup_rt_frame32(struct ksignal *ksig, sigset_t *set,
 	 */
 	uc_flags = UC_GPRS_HIGH;
 	if (MACHINE_HAS_VX) {
-		if (is_vx_task(current))
-			uc_flags |= UC_VXRS;
+		uc_flags |= UC_VXRS;
 	} else
 		frame_size -= sizeof(frame->uc.uc_mcontext_ext.vxrs_low) +
 			      sizeof(frame->uc.uc_mcontext_ext.vxrs_high);
diff --git a/arch/s390/kernel/cpcmd.c b/arch/s390/kernel/cpcmd.c
index 199ec92ef4fe..7f768914fb4f 100644
--- a/arch/s390/kernel/cpcmd.c
+++ b/arch/s390/kernel/cpcmd.c
@@ -14,6 +14,7 @@
 #include <linux/spinlock.h>
 #include <linux/stddef.h>
 #include <linux/string.h>
+#include <asm/diag.h>
 #include <asm/ebcdic.h>
 #include <asm/cpcmd.h>
 #include <asm/io.h>
@@ -70,6 +71,7 @@ int  __cpcmd(const char *cmd, char *response, int rlen, int *response_code)
 	memcpy(cpcmd_buf, cmd, cmdlen);
 	ASCEBC(cpcmd_buf, cmdlen);
 
+	diag_stat_inc(DIAG_STAT_X008);
 	if (response) {
 		memset(response, 0, rlen);
 		response_len = rlen;
diff --git a/arch/s390/kernel/crash_dump.c b/arch/s390/kernel/crash_dump.c
index 0c6c01eb3613..171e09bb8ea2 100644
--- a/arch/s390/kernel/crash_dump.c
+++ b/arch/s390/kernel/crash_dump.c
@@ -32,16 +32,6 @@ static struct memblock_type oldmem_type = {
 	.regions = &oldmem_region,
 };
 
-#define for_each_dump_mem_range(i, nid, p_start, p_end, p_nid)		\
-	for (i = 0, __next_mem_range(&i, nid, MEMBLOCK_NONE,		\
-				     &memblock.physmem,			\
-				     &oldmem_type, p_start,		\
-				     p_end, p_nid);			\
-	     i != (u64)ULLONG_MAX;					\
-	     __next_mem_range(&i, nid, MEMBLOCK_NONE, &memblock.physmem,\
-			      &oldmem_type,				\
-			      p_start, p_end, p_nid))
-
 struct dump_save_areas dump_save_areas;
 
 /*
@@ -515,7 +505,8 @@ static int get_mem_chunk_cnt(void)
 	int cnt = 0;
 	u64 idx;
 
-	for_each_dump_mem_range(idx, NUMA_NO_NODE, NULL, NULL, NULL)
+	for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
+			   MEMBLOCK_NONE, NULL, NULL, NULL)
 		cnt++;
 	return cnt;
 }
@@ -528,7 +519,8 @@ static void loads_init(Elf64_Phdr *phdr, u64 loads_offset)
 	phys_addr_t start, end;
 	u64 idx;
 
-	for_each_dump_mem_range(idx, NUMA_NO_NODE, &start, &end, NULL) {
+	for_each_mem_range(idx, &memblock.physmem, &oldmem_type, NUMA_NO_NODE,
+			   MEMBLOCK_NONE, &start, &end, NULL) {
 		phdr->p_filesz = end - start;
 		phdr->p_type = PT_LOAD;
 		phdr->p_offset = start;
diff --git a/arch/s390/kernel/diag.c b/arch/s390/kernel/diag.c
index 2f69243bf700..f98766ede4e1 100644
--- a/arch/s390/kernel/diag.c
+++ b/arch/s390/kernel/diag.c
@@ -6,12 +6,137 @@
  */
 
 #include <linux/module.h>
+#include <linux/cpu.h>
+#include <linux/seq_file.h>
+#include <linux/debugfs.h>
 #include <asm/diag.h>
+#include <asm/trace/diag.h>
+
+struct diag_stat {
+	unsigned int counter[NR_DIAG_STAT];
+};
+
+static DEFINE_PER_CPU(struct diag_stat, diag_stat);
+
+struct diag_desc {
+	int code;
+	char *name;
+};
+
+static const struct diag_desc diag_map[NR_DIAG_STAT] = {
+	[DIAG_STAT_X008] = { .code = 0x008, .name = "Console Function" },
+	[DIAG_STAT_X00C] = { .code = 0x00c, .name = "Pseudo Timer" },
+	[DIAG_STAT_X010] = { .code = 0x010, .name = "Release Pages" },
+	[DIAG_STAT_X014] = { .code = 0x014, .name = "Spool File Services" },
+	[DIAG_STAT_X044] = { .code = 0x044, .name = "Voluntary Timeslice End" },
+	[DIAG_STAT_X064] = { .code = 0x064, .name = "NSS Manipulation" },
+	[DIAG_STAT_X09C] = { .code = 0x09c, .name = "Relinquish Timeslice" },
+	[DIAG_STAT_X0DC] = { .code = 0x0dc, .name = "Appldata Control" },
+	[DIAG_STAT_X204] = { .code = 0x204, .name = "Logical-CPU Utilization" },
+	[DIAG_STAT_X210] = { .code = 0x210, .name = "Device Information" },
+	[DIAG_STAT_X224] = { .code = 0x224, .name = "EBCDIC-Name Table" },
+	[DIAG_STAT_X250] = { .code = 0x250, .name = "Block I/O" },
+	[DIAG_STAT_X258] = { .code = 0x258, .name = "Page-Reference Services" },
+	[DIAG_STAT_X288] = { .code = 0x288, .name = "Time Bomb" },
+	[DIAG_STAT_X2C4] = { .code = 0x2c4, .name = "FTP Services" },
+	[DIAG_STAT_X2FC] = { .code = 0x2fc, .name = "Guest Performance Data" },
+	[DIAG_STAT_X304] = { .code = 0x304, .name = "Partition-Resource Service" },
+	[DIAG_STAT_X308] = { .code = 0x308, .name = "List-Directed IPL" },
+	[DIAG_STAT_X500] = { .code = 0x500, .name = "Virtio Service" },
+};
+
+static int show_diag_stat(struct seq_file *m, void *v)
+{
+	struct diag_stat *stat;
+	unsigned long n = (unsigned long) v - 1;
+	int cpu, prec, tmp;
+
+	get_online_cpus();
+	if (n == 0) {
+		seq_puts(m, "         ");
+
+		for_each_online_cpu(cpu) {
+			prec = 10;
+			for (tmp = 10; cpu >= tmp; tmp *= 10)
+				prec--;
+			seq_printf(m, "%*s%d", prec, "CPU", cpu);
+		}
+		seq_putc(m, '\n');
+	} else if (n <= NR_DIAG_STAT) {
+		seq_printf(m, "diag %03x:", diag_map[n-1].code);
+		for_each_online_cpu(cpu) {
+			stat = &per_cpu(diag_stat, cpu);
+			seq_printf(m, " %10u", stat->counter[n-1]);
+		}
+		seq_printf(m, "    %s\n", diag_map[n-1].name);
+	}
+	put_online_cpus();
+	return 0;
+}
+
+static void *show_diag_stat_start(struct seq_file *m, loff_t *pos)
+{
+	return *pos <= nr_cpu_ids ? (void *)((unsigned long) *pos + 1) : NULL;
+}
+
+static void *show_diag_stat_next(struct seq_file *m, void *v, loff_t *pos)
+{
+	++*pos;
+	return show_diag_stat_start(m, pos);
+}
+
+static void show_diag_stat_stop(struct seq_file *m, void *v)
+{
+}
+
+static const struct seq_operations show_diag_stat_sops = {
+	.start	= show_diag_stat_start,
+	.next	= show_diag_stat_next,
+	.stop	= show_diag_stat_stop,
+	.show	= show_diag_stat,
+};
+
+static int show_diag_stat_open(struct inode *inode, struct file *file)
+{
+	return seq_open(file, &show_diag_stat_sops);
+}
+
+static const struct file_operations show_diag_stat_fops = {
+	.open		= show_diag_stat_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= seq_release,
+};
+
+
+static int __init show_diag_stat_init(void)
+{
+	debugfs_create_file("diag_stat", 0400, NULL, NULL,
+			    &show_diag_stat_fops);
+	return 0;
+}
+
+device_initcall(show_diag_stat_init);
+
+void diag_stat_inc(enum diag_stat_enum nr)
+{
+	this_cpu_inc(diag_stat.counter[nr]);
+	trace_diagnose(diag_map[nr].code);
+}
+EXPORT_SYMBOL(diag_stat_inc);
+
+void diag_stat_inc_norecursion(enum diag_stat_enum nr)
+{
+	this_cpu_inc(diag_stat.counter[nr]);
+	trace_diagnose_norecursion(diag_map[nr].code);
+}
+EXPORT_SYMBOL(diag_stat_inc_norecursion);
 
 /*
  * Diagnose 14: Input spool file manipulation
  */
-int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
+static inline int __diag14(unsigned long rx, unsigned long ry1,
+			   unsigned long subcode)
 {
 	register unsigned long _ry1 asm("2") = ry1;
 	register unsigned long _ry2 asm("3") = subcode;
@@ -29,6 +154,12 @@ int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
 
 	return rc;
 }
+
+int diag14(unsigned long rx, unsigned long ry1, unsigned long subcode)
+{
+	diag_stat_inc(DIAG_STAT_X014);
+	return __diag14(rx, ry1, subcode);
+}
 EXPORT_SYMBOL(diag14);
 
 /*
@@ -48,6 +179,7 @@ int diag210(struct diag210 *addr)
 	spin_lock_irqsave(&diag210_lock, flags);
 	diag210_tmp = *addr;
 
+	diag_stat_inc(DIAG_STAT_X210);
 	asm volatile(
 		"	lhi	%0,-1\n"
 		"	sam31\n"
diff --git a/arch/s390/kernel/early.c b/arch/s390/kernel/early.c
index 549a73a4b543..3c31609df959 100644
--- a/arch/s390/kernel/early.c
+++ b/arch/s390/kernel/early.c
@@ -17,6 +17,7 @@
 #include <linux/pfn.h>
 #include <linux/uaccess.h>
 #include <linux/kernel.h>
+#include <asm/diag.h>
 #include <asm/ebcdic.h>
 #include <asm/ipl.h>
 #include <asm/lowcore.h>
@@ -286,6 +287,7 @@ static __init void detect_diag9c(void)
 	int rc;
 
 	cpu_address = stap();
+	diag_stat_inc(DIAG_STAT_X09C);
 	asm volatile(
 		"	diag	%2,0,0x9c\n"
 		"0:	la	%0,0\n"
@@ -300,6 +302,7 @@ static __init void detect_diag44(void)
 {
 	int rc;
 
+	diag_stat_inc(DIAG_STAT_X044);
 	asm volatile(
 		"	diag	0,0,0x44\n"
 		"0:	la	%0,0\n"
@@ -326,9 +329,19 @@ static __init void detect_machine_facilities(void)
 		S390_lowcore.machine_flags |= MACHINE_FLAG_TE;
 	if (test_facility(51))
 		S390_lowcore.machine_flags |= MACHINE_FLAG_TLB_LC;
-	if (test_facility(129))
+	if (test_facility(129)) {
 		S390_lowcore.machine_flags |= MACHINE_FLAG_VX;
+		__ctl_set_bit(0, 17);
+	}
+}
+
+static int __init disable_vector_extension(char *str)
+{
+	S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX;
+	__ctl_clear_bit(0, 17);
+	return 1;
 }
+early_param("novx", disable_vector_extension);
 
 static int __init cad_setup(char *str)
 {
diff --git a/arch/s390/kernel/entry.S b/arch/s390/kernel/entry.S
index 582fe44ab07c..857b6526d298 100644
--- a/arch/s390/kernel/entry.S
+++ b/arch/s390/kernel/entry.S
@@ -20,8 +20,9 @@
 #include <asm/page.h>
 #include <asm/sigp.h>
 #include <asm/irq.h>
-#include <asm/fpu-internal.h>
 #include <asm/vx-insn.h>
+#include <asm/setup.h>
+#include <asm/nmi.h>
 
 __PT_R0      =	__PT_GPRS
 __PT_R1      =	__PT_GPRS + 8
@@ -139,6 +140,28 @@ _PIF_WORK	= (_PIF_PER_TRAP)
 #endif
 	.endm
 
+	/*
+	 * The TSTMSK macro generates a test-under-mask instruction by
+	 * calculating the memory offset for the specified mask value.
+	 * Mask value can be any constant.  The macro shifts the mask
+	 * value to calculate the memory offset for the test-under-mask
+	 * instruction.
+	 */
+	.macro TSTMSK addr, mask, size=8, bytepos=0
+		.if (\bytepos < \size) && (\mask >> 8)
+			.if (\mask & 0xff)
+				.error "Mask exceeds byte boundary"
+			.endif
+			TSTMSK \addr, "(\mask >> 8)", \size, "(\bytepos + 1)"
+			.exitm
+		.endif
+		.ifeq \mask
+			.error "Mask must not be zero"
+		.endif
+		off = \size - \bytepos - 1
+		tm	off+\addr, \mask
+	.endm
+
 	.section .kprobes.text, "ax"
 
 /*
@@ -164,8 +187,11 @@ ENTRY(__switch_to)
 	stg	%r15,__LC_KERNEL_STACK		# store end of kernel stack
 	lg	%r15,__THREAD_ksp(%r1)		# load kernel stack of next
 	lctl	%c4,%c4,__TASK_pid(%r3)		# load pid to control reg. 4
-	mvc	__LC_CURRENT_PID+4(4,%r0),__TASK_pid(%r3) # store pid of next
+	mvc	__LC_CURRENT_PID(4,%r0),__TASK_pid(%r3) # store pid of next
 	lmg	%r6,%r15,__SF_GPRS(%r15)	# load gprs of next task
+	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_LPP
+	bzr	%r14
+	.insn	s,0xb2800000,__LC_LPP		# set program parameter
 	br	%r14
 
 .L__critical_start:
@@ -180,8 +206,8 @@ ENTRY(sie64a)
 	stmg	%r6,%r14,__SF_GPRS(%r15)	# save kernel registers
 	stg	%r2,__SF_EMPTY(%r15)		# save control block pointer
 	stg	%r3,__SF_EMPTY+8(%r15)		# save guest register save area
-	xc	__SF_EMPTY+16(16,%r15),__SF_EMPTY+16(%r15) # host id & reason
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU	# load guest fp/vx registers ?
+	xc	__SF_EMPTY+16(8,%r15),__SF_EMPTY+16(%r15) # reason code = 0
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU		# load guest fp/vx registers ?
 	jno	.Lsie_load_guest_gprs
 	brasl	%r14,load_fpu_regs		# load guest fp/vx regs
 .Lsie_load_guest_gprs:
@@ -195,16 +221,9 @@ ENTRY(sie64a)
 	oi	__SIE_PROG0C+3(%r14),1		# we are going into SIE now
 	tm	__SIE_PROG20+3(%r14),3		# last exit...
 	jnz	.Lsie_skip
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lsie_skip			# exit if fp/vx regs changed
-	tm	__LC_MACHINE_FLAGS+6,0x20	# MACHINE_FLAG_LPP
-	jz	.Lsie_enter
-	.insn	s,0xb2800000,__LC_CURRENT_PID	# set guest id to pid
-.Lsie_enter:
 	sie	0(%r14)
-	tm	__LC_MACHINE_FLAGS+6,0x20	# MACHINE_FLAG_LPP
-	jz	.Lsie_skip
-	.insn	s,0xb2800000,__SF_EMPTY+16(%r15)# set host id
 .Lsie_skip:
 	ni	__SIE_PROG0C+3(%r14),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
@@ -221,11 +240,11 @@ sie_exit:
 	lg	%r14,__SF_EMPTY+8(%r15)		# load guest register save area
 	stmg	%r0,%r13,0(%r14)		# save guest gprs 0-13
 	lmg	%r6,%r14,__SF_GPRS(%r15)	# restore kernel registers
-	lg	%r2,__SF_EMPTY+24(%r15)		# return exit reason code
+	lg	%r2,__SF_EMPTY+16(%r15)		# return exit reason code
 	br	%r14
 .Lsie_fault:
 	lghi	%r14,-EFAULT
-	stg	%r14,__SF_EMPTY+24(%r15)	# set exit reason code
+	stg	%r14,__SF_EMPTY+16(%r15)	# set exit reason code
 	j	sie_exit
 
 	EX_TABLE(.Lrewind_pad,.Lsie_fault)
@@ -271,7 +290,7 @@ ENTRY(system_call)
 	stg	%r2,__PT_ORIG_GPR2(%r11)
 	stg	%r7,STACK_FRAME_OVERHEAD(%r15)
 	lgf	%r9,0(%r8,%r10)			# get system call add.
-	tm	__TI_flags+7(%r12),_TIF_TRACE
+	TSTMSK	__TI_flags(%r12),_TIF_TRACE
 	jnz	.Lsysc_tracesys
 	basr	%r14,%r9			# call sys_xxxx
 	stg	%r2,__PT_R2(%r11)		# store return value
@@ -279,11 +298,11 @@ ENTRY(system_call)
 .Lsysc_return:
 	LOCKDEP_SYS_EXIT
 .Lsysc_tif:
-	tm	__PT_FLAGS+7(%r11),_PIF_WORK
+	TSTMSK	__PT_FLAGS(%r11),_PIF_WORK
 	jnz	.Lsysc_work
-	tm	__TI_flags+7(%r12),_TIF_WORK
+	TSTMSK	__TI_flags(%r12),_TIF_WORK
 	jnz	.Lsysc_work			# check for work
-	tm	__LC_CPU_FLAGS+7,_CIF_WORK
+	TSTMSK	__LC_CPU_FLAGS,_CIF_WORK
 	jnz	.Lsysc_work
 .Lsysc_restore:
 	lg	%r14,__LC_VDSO_PER_CPU
@@ -299,23 +318,23 @@ ENTRY(system_call)
 # One of the work bits is on. Find out which one.
 #
 .Lsysc_work:
-	tm	__LC_CPU_FLAGS+7,_CIF_MCCK_PENDING
+	TSTMSK	__LC_CPU_FLAGS,_CIF_MCCK_PENDING
 	jo	.Lsysc_mcck_pending
-	tm	__TI_flags+7(%r12),_TIF_NEED_RESCHED
+	TSTMSK	__TI_flags(%r12),_TIF_NEED_RESCHED
 	jo	.Lsysc_reschedule
 #ifdef CONFIG_UPROBES
-	tm	__TI_flags+7(%r12),_TIF_UPROBE
+	TSTMSK	__TI_flags(%r12),_TIF_UPROBE
 	jo	.Lsysc_uprobe_notify
 #endif
-	tm	__PT_FLAGS+7(%r11),_PIF_PER_TRAP
+	TSTMSK	__PT_FLAGS(%r11),_PIF_PER_TRAP
 	jo	.Lsysc_singlestep
-	tm	__TI_flags+7(%r12),_TIF_SIGPENDING
+	TSTMSK	__TI_flags(%r12),_TIF_SIGPENDING
 	jo	.Lsysc_sigpending
-	tm	__TI_flags+7(%r12),_TIF_NOTIFY_RESUME
+	TSTMSK	__TI_flags(%r12),_TIF_NOTIFY_RESUME
 	jo	.Lsysc_notify_resume
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lsysc_vxrs
-	tm	__LC_CPU_FLAGS+7,_CIF_ASCE
+	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE
 	jo	.Lsysc_uaccess
 	j	.Lsysc_return		# beware of critical section cleanup
 
@@ -354,7 +373,7 @@ ENTRY(system_call)
 .Lsysc_sigpending:
 	lgr	%r2,%r11		# pass pointer to pt_regs
 	brasl	%r14,do_signal
-	tm	__PT_FLAGS+7(%r11),_PIF_SYSCALL
+	TSTMSK	__PT_FLAGS(%r11),_PIF_SYSCALL
 	jno	.Lsysc_return
 	lmg	%r2,%r7,__PT_R2(%r11)	# load svc arguments
 	lg	%r10,__TI_sysc_table(%r12)	# address of system call table
@@ -414,7 +433,7 @@ ENTRY(system_call)
 	basr	%r14,%r9		# call sys_xxx
 	stg	%r2,__PT_R2(%r11)	# store return value
 .Lsysc_tracenogo:
-	tm	__TI_flags+7(%r12),_TIF_TRACE
+	TSTMSK	__TI_flags(%r12),_TIF_TRACE
 	jz	.Lsysc_return
 	lgr	%r2,%r11		# pass pointer to pt_regs
 	larl	%r14,.Lsysc_return
@@ -544,6 +563,8 @@ ENTRY(io_int_handler)
 	stmg	%r8,%r9,__PT_PSW(%r11)
 	mvc	__PT_INT_CODE(12,%r11),__LC_SUBCHANNEL_ID
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
+	TSTMSK	__LC_CPU_FLAGS,_CIF_IGNORE_IRQ
+	jo	.Lio_restore
 	TRACE_IRQS_OFF
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 .Lio_loop:
@@ -554,7 +575,7 @@ ENTRY(io_int_handler)
 	lghi	%r3,THIN_INTERRUPT
 .Lio_call:
 	brasl	%r14,do_IRQ
-	tm	__LC_MACHINE_FLAGS+6,0x10	# MACHINE_FLAG_LPAR
+	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_LPAR
 	jz	.Lio_return
 	tpi	0
 	jz	.Lio_return
@@ -564,9 +585,9 @@ ENTRY(io_int_handler)
 	LOCKDEP_SYS_EXIT
 	TRACE_IRQS_ON
 .Lio_tif:
-	tm	__TI_flags+7(%r12),_TIF_WORK
+	TSTMSK	__TI_flags(%r12),_TIF_WORK
 	jnz	.Lio_work		# there is work to do (signals etc.)
-	tm	__LC_CPU_FLAGS+7,_CIF_WORK
+	TSTMSK	__LC_CPU_FLAGS,_CIF_WORK
 	jnz	.Lio_work
 .Lio_restore:
 	lg	%r14,__LC_VDSO_PER_CPU
@@ -594,7 +615,7 @@ ENTRY(io_int_handler)
 	# check for preemptive scheduling
 	icm	%r0,15,__TI_precount(%r12)
 	jnz	.Lio_restore		# preemption is disabled
-	tm	__TI_flags+7(%r12),_TIF_NEED_RESCHED
+	TSTMSK	__TI_flags(%r12),_TIF_NEED_RESCHED
 	jno	.Lio_restore
 	# switch to kernel stack
 	lg	%r1,__PT_R15(%r11)
@@ -626,17 +647,17 @@ ENTRY(io_int_handler)
 # One of the work bits is on. Find out which one.
 #
 .Lio_work_tif:
-	tm	__LC_CPU_FLAGS+7,_CIF_MCCK_PENDING
+	TSTMSK	__LC_CPU_FLAGS,_CIF_MCCK_PENDING
 	jo	.Lio_mcck_pending
-	tm	__TI_flags+7(%r12),_TIF_NEED_RESCHED
+	TSTMSK	__TI_flags(%r12),_TIF_NEED_RESCHED
 	jo	.Lio_reschedule
-	tm	__TI_flags+7(%r12),_TIF_SIGPENDING
+	TSTMSK	__TI_flags(%r12),_TIF_SIGPENDING
 	jo	.Lio_sigpending
-	tm	__TI_flags+7(%r12),_TIF_NOTIFY_RESUME
+	TSTMSK	__TI_flags(%r12),_TIF_NOTIFY_RESUME
 	jo	.Lio_notify_resume
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	jo	.Lio_vxrs
-	tm	__LC_CPU_FLAGS+7,_CIF_ASCE
+	TSTMSK	__LC_CPU_FLAGS,_CIF_ASCE
 	jo	.Lio_uaccess
 	j	.Lio_return		# beware of critical section cleanup
 
@@ -719,6 +740,8 @@ ENTRY(ext_int_handler)
 	mvc	__PT_INT_PARM(4,%r11),__LC_EXT_PARAMS
 	mvc	__PT_INT_PARM_LONG(8,%r11),0(%r1)
 	xc	__PT_FLAGS(8,%r11),__PT_FLAGS(%r11)
+	TSTMSK	__LC_CPU_FLAGS,_CIF_IGNORE_IRQ
+	jo	.Lio_restore
 	TRACE_IRQS_OFF
 	xc	__SF_BACKCHAIN(8,%r15),__SF_BACKCHAIN(%r15)
 	lgr	%r2,%r11		# pass pointer to pt_regs
@@ -748,27 +771,22 @@ ENTRY(psw_idle)
 	br	%r14
 .Lpsw_idle_end:
 
-/* Store floating-point controls and floating-point or vector extension
- * registers instead.  A critical section cleanup assures that the registers
- * are stored even if interrupted for some other work.	The register %r2
- * designates a struct fpu to store register contents.	If the specified
- * structure does not contain a register save area, the register store is
- * omitted (see also comments in arch_dup_task_struct()).
- *
- * The CIF_FPU flag is set in any case.  The CIF_FPU triggers a lazy restore
- * of the register contents at system call or io return.
+/*
+ * Store floating-point controls and floating-point or vector register
+ * depending whether the vector facility is available.	A critical section
+ * cleanup assures that the registers are stored even if interrupted for
+ * some other work.  The CIF_FPU flag is set to trigger a lazy restore
+ * of the register contents at return from io or a system call.
  */
 ENTRY(save_fpu_regs)
 	lg	%r2,__LC_CURRENT
 	aghi	%r2,__TASK_thread
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	bor	%r14
 	stfpc	__THREAD_FPU_fpc(%r2)
 .Lsave_fpu_regs_fpc_end:
 	lg	%r3,__THREAD_FPU_regs(%r2)
-	ltgr	%r3,%r3
-	jz	.Lsave_fpu_regs_done	  # no save area -> set CIF_FPU
-	tm	__THREAD_FPU_flags+3(%r2),FPU_USE_VX
+	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
 	jz	.Lsave_fpu_regs_fp	  # no -> store FP regs
 .Lsave_fpu_regs_vx_low:
 	VSTM	%v0,%v15,0,%r3		  # vstm 0,15,0(3)
@@ -797,41 +815,30 @@ ENTRY(save_fpu_regs)
 	br	%r14
 .Lsave_fpu_regs_end:
 
-/* Load floating-point controls and floating-point or vector extension
- * registers.  A critical section cleanup assures that the register contents
- * are loaded even if interrupted for some other work.	Depending on the saved
- * FP/VX state, the vector-enablement control, CR0.46, is either set or cleared.
+/*
+ * Load floating-point controls and floating-point or vector registers.
+ * A critical section cleanup assures that the register contents are
+ * loaded even if interrupted for some other work.
  *
  * There are special calling conventions to fit into sysc and io return work:
  *	%r15:	<kernel stack>
  * The function requires:
- *	%r4 and __SF_EMPTY+32(%r15)
+ *	%r4
  */
 load_fpu_regs:
 	lg	%r4,__LC_CURRENT
 	aghi	%r4,__TASK_thread
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	bnor	%r14
 	lfpc	__THREAD_FPU_fpc(%r4)
-	stctg	%c0,%c0,__SF_EMPTY+32(%r15)	# store CR0
-	tm	__THREAD_FPU_flags+3(%r4),FPU_USE_VX	# VX-enabled task ?
+	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
 	lg	%r4,__THREAD_FPU_regs(%r4)	# %r4 <- reg save area
-	jz	.Lload_fpu_regs_fp_ctl		# -> no VX, load FP regs
-.Lload_fpu_regs_vx_ctl:
-	tm	__SF_EMPTY+32+5(%r15),2		# test VX control
-	jo	.Lload_fpu_regs_vx
-	oi	__SF_EMPTY+32+5(%r15),2		# set VX control
-	lctlg	%c0,%c0,__SF_EMPTY+32(%r15)
+	jz	.Lload_fpu_regs_fp		# -> no VX, load FP regs
 .Lload_fpu_regs_vx:
 	VLM	%v0,%v15,0,%r4
 .Lload_fpu_regs_vx_high:
 	VLM	%v16,%v31,256,%r4
 	j	.Lload_fpu_regs_done
-.Lload_fpu_regs_fp_ctl:
-	tm	__SF_EMPTY+32+5(%r15),2		# test VX control
-	jz	.Lload_fpu_regs_fp
-	ni	__SF_EMPTY+32+5(%r15),253	# clear VX control
-	lctlg	%c0,%c0,__SF_EMPTY+32(%r15)
 .Lload_fpu_regs_fp:
 	ld	0,0(%r4)
 	ld	1,8(%r4)
@@ -854,16 +861,6 @@ load_fpu_regs:
 	br	%r14
 .Lload_fpu_regs_end:
 
-/* Test and set the vector enablement control in CR0.46 */
-ENTRY(__ctl_set_vx)
-	stctg	%c0,%c0,__SF_EMPTY(%r15)
-	tm	__SF_EMPTY+5(%r15),2
-	bor	%r14
-	oi	__SF_EMPTY+5(%r15),2
-	lctlg	%c0,%c0,__SF_EMPTY(%r15)
-	br	%r14
-.L__ctl_set_vx_end:
-
 .L__critical_end:
 
 /*
@@ -878,11 +875,11 @@ ENTRY(mcck_int_handler)
 	lg	%r12,__LC_THREAD_INFO
 	larl	%r13,cleanup_critical
 	lmg	%r8,%r9,__LC_MCK_OLD_PSW
-	tm	__LC_MCCK_CODE,0x80	# system damage?
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_SYSTEM_DAMAGE
 	jo	.Lmcck_panic		# yes -> rest of mcck code invalid
 	lghi	%r14,__LC_CPU_TIMER_SAVE_AREA
 	mvc	__LC_MCCK_ENTER_TIMER(8),0(%r14)
-	tm	__LC_MCCK_CODE+5,0x02	# stored cpu timer value valid?
+	TSTMSK	__LC_MCCK_CODE,MCCK_CODE_CPU_TIMER_VALID
 	jo	3f
 	la	%r14,__LC_SYNC_ENTER_TIMER
 	clc	0(8,%r14),__LC_ASYNC_ENTER_TIMER
@@ -896,7 +893,7 @@ ENTRY(mcck_int_handler)
 	la	%r14,__LC_LAST_UPDATE_TIMER
 2:	spt	0(%r14)
 	mvc	__LC_MCCK_ENTER_TIMER(8),0(%r14)
-3:	tm	__LC_MCCK_CODE+2,0x09	# mwp + ia of old psw valid?
+3:	TSTMSK	__LC_MCCK_CODE,(MCCK_CODE_PSW_MWP_VALID|MCCK_CODE_PSW_IA_VALID)
 	jno	.Lmcck_panic		# no -> skip cleanup critical
 	SWITCH_ASYNC __LC_GPREGS_SAVE_AREA+64,__LC_MCCK_ENTER_TIMER
 .Lmcck_skip:
@@ -916,7 +913,7 @@ ENTRY(mcck_int_handler)
 	la	%r11,STACK_FRAME_OVERHEAD(%r1)
 	lgr	%r15,%r1
 	ssm	__LC_PGM_NEW_PSW	# turn dat on, keep irqs off
-	tm	__LC_CPU_FLAGS+7,_CIF_MCCK_PENDING
+	TSTMSK	__LC_CPU_FLAGS,_CIF_MCCK_PENDING
 	jno	.Lmcck_return
 	TRACE_IRQS_OFF
 	brasl	%r14,s390_handle_mcck
@@ -941,7 +938,10 @@ ENTRY(mcck_int_handler)
 # PSW restart interrupt handler
 #
 ENTRY(restart_int_handler)
-	stg	%r15,__LC_SAVE_AREA_RESTART
+	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_LPP
+	jz	0f
+	.insn	s,0xb2800000,__LC_LPP
+0:	stg	%r15,__LC_SAVE_AREA_RESTART
 	lg	%r15,__LC_RESTART_STACK
 	aghi	%r15,-__PT_SIZE			# create pt_regs on stack
 	xc	0(__PT_SIZE,%r15),0(%r15)
@@ -1019,10 +1019,6 @@ cleanup_critical:
 	jl	0f
 	clg	%r9,BASED(.Lcleanup_table+104)	# .Lload_fpu_regs_end
 	jl	.Lcleanup_load_fpu_regs
-	clg	%r9,BASED(.Lcleanup_table+112)	# __ctl_set_vx
-	jl	0f
-	clg	%r9,BASED(.Lcleanup_table+120)	# .L__ctl_set_vx_end
-	jl	.Lcleanup___ctl_set_vx
 0:	br	%r14
 
 	.align	8
@@ -1041,8 +1037,6 @@ cleanup_critical:
 	.quad	.Lsave_fpu_regs_end
 	.quad	load_fpu_regs
 	.quad	.Lload_fpu_regs_end
-	.quad	__ctl_set_vx
-	.quad	.L__ctl_set_vx_end
 
 #if IS_ENABLED(CONFIG_KVM)
 .Lcleanup_table_sie:
@@ -1051,10 +1045,7 @@ cleanup_critical:
 
 .Lcleanup_sie:
 	lg	%r9,__SF_EMPTY(%r15)		# get control block pointer
-	tm	__LC_MACHINE_FLAGS+6,0x20	# MACHINE_FLAG_LPP
-	jz	0f
-	.insn	s,0xb2800000,__SF_EMPTY+16(%r15)# set host id
-0:	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
+	ni	__SIE_PROG0C+3(%r9),0xfe	# no longer in SIE
 	lctlg	%c1,%c1,__LC_USER_ASCE		# load primary asce
 	larl	%r9,sie_exit			# skip forward to sie_exit
 	br	%r14
@@ -1206,7 +1197,7 @@ cleanup_critical:
 	.quad	.Lpsw_idle_lpsw
 
 .Lcleanup_save_fpu_regs:
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	bor	%r14
 	clg	%r9,BASED(.Lcleanup_save_fpu_regs_done)
 	jhe	5f
@@ -1224,9 +1215,7 @@ cleanup_critical:
 	stfpc	__THREAD_FPU_fpc(%r2)
 1:	# Load register save area and check if VX is active
 	lg	%r3,__THREAD_FPU_regs(%r2)
-	ltgr	%r3,%r3
-	jz	5f			  # no save area -> set CIF_FPU
-	tm	__THREAD_FPU_flags+3(%r2),FPU_USE_VX
+	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
 	jz	4f			  # no VX -> store FP regs
 2:	# Store vector registers (V0-V15)
 	VSTM	%v0,%v15,0,%r3		  # vstm 0,15,0(3)
@@ -1266,43 +1255,27 @@ cleanup_critical:
 	.quad	.Lsave_fpu_regs_done
 
 .Lcleanup_load_fpu_regs:
-	tm	__LC_CPU_FLAGS+7,_CIF_FPU
+	TSTMSK	__LC_CPU_FLAGS,_CIF_FPU
 	bnor	%r14
 	clg	%r9,BASED(.Lcleanup_load_fpu_regs_done)
 	jhe	1f
 	clg	%r9,BASED(.Lcleanup_load_fpu_regs_fp)
 	jhe	2f
-	clg	%r9,BASED(.Lcleanup_load_fpu_regs_fp_ctl)
-	jhe	3f
 	clg	%r9,BASED(.Lcleanup_load_fpu_regs_vx_high)
-	jhe	4f
+	jhe	3f
 	clg	%r9,BASED(.Lcleanup_load_fpu_regs_vx)
-	jhe	5f
-	clg	%r9,BASED(.Lcleanup_load_fpu_regs_vx_ctl)
-	jhe	6f
+	jhe	4f
 	lg	%r4,__LC_CURRENT
 	aghi	%r4,__TASK_thread
 	lfpc	__THREAD_FPU_fpc(%r4)
-	tm	__THREAD_FPU_flags+3(%r4),FPU_USE_VX	# VX-enabled task ?
+	TSTMSK	__LC_MACHINE_FLAGS,MACHINE_FLAG_VX
 	lg	%r4,__THREAD_FPU_regs(%r4)	# %r4 <- reg save area
-	jz	3f				# -> no VX, load FP regs
-6:	# Set VX-enablement control
-	stctg	%c0,%c0,__SF_EMPTY+32(%r15)	# store CR0
-	tm	__SF_EMPTY+32+5(%r15),2		# test VX control
-	jo	5f
-	oi	__SF_EMPTY+32+5(%r15),2		# set VX control
-	lctlg	%c0,%c0,__SF_EMPTY+32(%r15)
-5:	# Load V0 ..V15 registers
+	jz	2f				# -> no VX, load FP regs
+4:	# Load V0 ..V15 registers
 	VLM	%v0,%v15,0,%r4
-4:	# Load V16..V31 registers
+3:	# Load V16..V31 registers
 	VLM	%v16,%v31,256,%r4
 	j	1f
-3:	# Clear VX-enablement control for FP
-	stctg	%c0,%c0,__SF_EMPTY+32(%r15)	# store CR0
-	tm	__SF_EMPTY+32+5(%r15),2		# test VX control
-	jz	2f
-	ni	__SF_EMPTY+32+5(%r15),253	# clear VX control
-	lctlg	%c0,%c0,__SF_EMPTY+32(%r15)
 2:	# Load floating-point registers
 	ld	0,0(%r4)
 	ld	1,8(%r4)
@@ -1324,28 +1297,15 @@ cleanup_critical:
 	ni	__LC_CPU_FLAGS+7,255-_CIF_FPU
 	lg	%r9,48(%r11)		# return from load_fpu_regs
 	br	%r14
-.Lcleanup_load_fpu_regs_vx_ctl:
-	.quad	.Lload_fpu_regs_vx_ctl
 .Lcleanup_load_fpu_regs_vx:
 	.quad	.Lload_fpu_regs_vx
 .Lcleanup_load_fpu_regs_vx_high:
 	.quad	.Lload_fpu_regs_vx_high
-.Lcleanup_load_fpu_regs_fp_ctl:
-	.quad	.Lload_fpu_regs_fp_ctl
 .Lcleanup_load_fpu_regs_fp:
 	.quad	.Lload_fpu_regs_fp
 .Lcleanup_load_fpu_regs_done:
 	.quad	.Lload_fpu_regs_done
 
-.Lcleanup___ctl_set_vx:
-	stctg	%c0,%c0,__SF_EMPTY(%r15)
-	tm	__SF_EMPTY+5(%r15),2
-	bor	%r14
-	oi	__SF_EMPTY+5(%r15),2
-	lctlg	%c0,%c0,__SF_EMPTY(%r15)
-	lg	%r9,48(%r11)		# return from __ctl_set_vx
-	br	%r14
-
 /*
  * Integer constants
  */
diff --git a/arch/s390/kernel/entry.h b/arch/s390/kernel/entry.h
index 834df047d35f..b7019ab74070 100644
--- a/arch/s390/kernel/entry.h
+++ b/arch/s390/kernel/entry.h
@@ -16,13 +16,10 @@ void io_int_handler(void);
 void mcck_int_handler(void);
 void restart_int_handler(void);
 void restart_call_handler(void);
-void psw_idle(struct s390_idle_data *, unsigned long);
 
 asmlinkage long do_syscall_trace_enter(struct pt_regs *regs);
 asmlinkage void do_syscall_trace_exit(struct pt_regs *regs);
 
-int alloc_vector_registers(struct task_struct *tsk);
-
 void do_protection_exception(struct pt_regs *regs);
 void do_dat_exception(struct pt_regs *regs);
 
diff --git a/arch/s390/kernel/head64.S b/arch/s390/kernel/head64.S
index d7c00507568a..58b719fa8067 100644
--- a/arch/s390/kernel/head64.S
+++ b/arch/s390/kernel/head64.S
@@ -16,7 +16,12 @@
 
 __HEAD
 ENTRY(startup_continue)
-	larl	%r1,sched_clock_base_cc
+	tm	__LC_STFL_FAC_LIST+6,0x80	# LPP available ?
+	jz	0f
+	xc	__LC_LPP+1(7,0),__LC_LPP+1	# clear lpp and current_pid
+	mvi	__LC_LPP,0x80			#   and set LPP_MAGIC
+	.insn	s,0xb2800000,__LC_LPP		# load program parameter
+0:	larl	%r1,sched_clock_base_cc
 	mvc	0(8,%r1),__LC_LAST_UPDATE_CLOCK
 	larl	%r13,.LPG1		# get base
 	lctlg	%c0,%c15,.Lctl-.LPG1(%r13)	# load control registers
diff --git a/arch/s390/kernel/ipl.c b/arch/s390/kernel/ipl.c
index 52fbef91d1d9..f6d8acd7e136 100644
--- a/arch/s390/kernel/ipl.c
+++ b/arch/s390/kernel/ipl.c
@@ -17,6 +17,7 @@
 #include <linux/gfp.h>
 #include <linux/crash_dump.h>
 #include <linux/debug_locks.h>
+#include <asm/diag.h>
 #include <asm/ipl.h>
 #include <asm/smp.h>
 #include <asm/setup.h>
@@ -165,7 +166,7 @@ static struct ipl_parameter_block *dump_block_ccw;
 
 static struct sclp_ipl_info sclp_ipl_info;
 
-int diag308(unsigned long subcode, void *addr)
+static inline int __diag308(unsigned long subcode, void *addr)
 {
 	register unsigned long _addr asm("0") = (unsigned long) addr;
 	register unsigned long _rc asm("1") = 0;
@@ -178,6 +179,12 @@ int diag308(unsigned long subcode, void *addr)
 		: "d" (subcode) : "cc", "memory");
 	return _rc;
 }
+
+int diag308(unsigned long subcode, void *addr)
+{
+	diag_stat_inc(DIAG_STAT_X308);
+	return __diag308(subcode, addr);
+}
 EXPORT_SYMBOL_GPL(diag308);
 
 /* SYSFS */
diff --git a/arch/s390/kernel/irq.c b/arch/s390/kernel/irq.c
index e9d9addfaa44..f41d5208aaf7 100644
--- a/arch/s390/kernel/irq.c
+++ b/arch/s390/kernel/irq.c
@@ -69,7 +69,6 @@ static const struct irq_class irqclass_sub_desc[] = {
 	{.irq = IRQEXT_IUC, .name = "IUC", .desc = "[EXT] IUCV"},
 	{.irq = IRQEXT_CMS, .name = "CMS", .desc = "[EXT] CPU-Measurement: Sampling"},
 	{.irq = IRQEXT_CMC, .name = "CMC", .desc = "[EXT] CPU-Measurement: Counter"},
-	{.irq = IRQEXT_CMR, .name = "CMR", .desc = "[EXT] CPU-Measurement: RI"},
 	{.irq = IRQEXT_FTP, .name = "FTP", .desc = "[EXT] HMC FTP Service"},
 	{.irq = IRQIO_CIO,  .name = "CIO", .desc = "[I/O] Common I/O Layer Interrupt"},
 	{.irq = IRQIO_QAI,  .name = "QAI", .desc = "[I/O] QDIO Adapter Interrupt"},
diff --git a/arch/s390/kernel/nmi.c b/arch/s390/kernel/nmi.c
index 0ae6f8e74840..07302ce37648 100644
--- a/arch/s390/kernel/nmi.c
+++ b/arch/s390/kernel/nmi.c
@@ -21,19 +21,20 @@
 #include <asm/nmi.h>
 #include <asm/crw.h>
 #include <asm/switch_to.h>
-#include <asm/fpu-internal.h>
 #include <asm/ctl_reg.h>
 
 struct mcck_struct {
-	int kill_task;
-	int channel_report;
-	int warning;
-	unsigned long long mcck_code;
+	unsigned int kill_task : 1;
+	unsigned int channel_report : 1;
+	unsigned int warning : 1;
+	unsigned int etr_queue : 1;
+	unsigned int stp_queue : 1;
+	unsigned long mcck_code;
 };
 
 static DEFINE_PER_CPU(struct mcck_struct, cpu_mcck);
 
-static void s390_handle_damage(char *msg)
+static void s390_handle_damage(void)
 {
 	smp_send_stop();
 	disabled_wait((unsigned long) __builtin_return_address(0));
@@ -81,10 +82,14 @@ void s390_handle_mcck(void)
 		if (xchg(&mchchk_wng_posted, 1) == 0)
 			kill_cad_pid(SIGPWR, 1);
 	}
+	if (mcck.etr_queue)
+		etr_queue_work();
+	if (mcck.stp_queue)
+		stp_queue_work();
 	if (mcck.kill_task) {
 		local_irq_enable();
 		printk(KERN_EMERG "mcck: Terminating task because of machine "
-		       "malfunction (code 0x%016llx).\n", mcck.mcck_code);
+		       "malfunction (code 0x%016lx).\n", mcck.mcck_code);
 		printk(KERN_EMERG "mcck: task: %s, pid: %d.\n",
 		       current->comm, current->pid);
 		do_exit(SIGSEGV);
@@ -96,7 +101,7 @@ EXPORT_SYMBOL_GPL(s390_handle_mcck);
  * returns 0 if all registers could be validated
  * returns 1 otherwise
  */
-static int notrace s390_revalidate_registers(struct mci *mci)
+static int notrace s390_validate_registers(union mci mci)
 {
 	int kill_task;
 	u64 zero;
@@ -105,14 +110,14 @@ static int notrace s390_revalidate_registers(struct mci *mci)
 	kill_task = 0;
 	zero = 0;
 
-	if (!mci->gr) {
+	if (!mci.gr) {
 		/*
 		 * General purpose registers couldn't be restored and have
 		 * unknown contents. Process needs to be terminated.
 		 */
 		kill_task = 1;
 	}
-	if (!mci->fp) {
+	if (!mci.fp) {
 		/*
 		 * Floating point registers can't be restored and
 		 * therefore the process needs to be terminated.
@@ -121,7 +126,7 @@ static int notrace s390_revalidate_registers(struct mci *mci)
 	}
 	fpt_save_area = &S390_lowcore.floating_pt_save_area;
 	fpt_creg_save_area = &S390_lowcore.fpt_creg_save_area;
-	if (!mci->fc) {
+	if (!mci.fc) {
 		/*
 		 * Floating point control register can't be restored.
 		 * Task will be terminated.
@@ -132,7 +137,7 @@ static int notrace s390_revalidate_registers(struct mci *mci)
 		asm volatile("lfpc 0(%0)" : : "a" (fpt_creg_save_area));
 
 	if (!MACHINE_HAS_VX) {
-		/* Revalidate floating point registers */
+		/* Validate floating point registers */
 		asm volatile(
 			"	ld	0,0(%0)\n"
 			"	ld	1,8(%0)\n"
@@ -152,10 +157,10 @@ static int notrace s390_revalidate_registers(struct mci *mci)
 			"	ld	15,120(%0)\n"
 			: : "a" (fpt_save_area));
 	} else {
-		/* Revalidate vector registers */
+		/* Validate vector registers */
 		union ctlreg0 cr0;
 
-		if (!mci->vr) {
+		if (!mci.vr) {
 			/*
 			 * Vector registers can't be restored and therefore
 			 * the process needs to be terminated.
@@ -173,38 +178,38 @@ static int notrace s390_revalidate_registers(struct mci *mci)
 				 &S390_lowcore.vector_save_area) : "1");
 		__ctl_load(S390_lowcore.cregs_save_area[0], 0, 0);
 	}
-	/* Revalidate access registers */
+	/* Validate access registers */
 	asm volatile(
 		"	lam	0,15,0(%0)"
 		: : "a" (&S390_lowcore.access_regs_save_area));
-	if (!mci->ar) {
+	if (!mci.ar) {
 		/*
 		 * Access registers have unknown contents.
 		 * Terminating task.
 		 */
 		kill_task = 1;
 	}
-	/* Revalidate control registers */
-	if (!mci->cr) {
+	/* Validate control registers */
+	if (!mci.cr) {
 		/*
 		 * Control registers have unknown contents.
 		 * Can't recover and therefore stopping machine.
 		 */
-		s390_handle_damage("invalid control registers.");
+		s390_handle_damage();
 	} else {
 		asm volatile(
 			"	lctlg	0,15,0(%0)"
 			: : "a" (&S390_lowcore.cregs_save_area));
 	}
 	/*
-	 * We don't even try to revalidate the TOD register, since we simply
+	 * We don't even try to validate the TOD register, since we simply
 	 * can't write something sensible into that register.
 	 */
 	/*
-	 * See if we can revalidate the TOD programmable register with its
+	 * See if we can validate the TOD programmable register with its
 	 * old contents (should be zero) otherwise set it to zero.
 	 */
-	if (!mci->pr)
+	if (!mci.pr)
 		asm volatile(
 			"	sr	0,0\n"
 			"	sckpf"
@@ -215,17 +220,17 @@ static int notrace s390_revalidate_registers(struct mci *mci)
 			"	sckpf"
 			: : "a" (&S390_lowcore.tod_progreg_save_area)
 			: "0", "cc");
-	/* Revalidate clock comparator register */
+	/* Validate clock comparator register */
 	set_clock_comparator(S390_lowcore.clock_comparator);
 	/* Check if old PSW is valid */
-	if (!mci->wp)
+	if (!mci.wp)
 		/*
 		 * Can't tell if we come from user or kernel mode
 		 * -> stopping machine.
 		 */
-		s390_handle_damage("old psw invalid.");
+		s390_handle_damage();
 
-	if (!mci->ms || !mci->pm || !mci->ia)
+	if (!mci.ms || !mci.pm || !mci.ia)
 		kill_task = 1;
 
 	return kill_task;
@@ -249,21 +254,21 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 	static unsigned long long last_ipd;
 	struct mcck_struct *mcck;
 	unsigned long long tmp;
-	struct mci *mci;
+	union mci mci;
 	int umode;
 
 	nmi_enter();
 	inc_irq_stat(NMI_NMI);
-	mci = (struct mci *) &S390_lowcore.mcck_interruption_code;
+	mci.val = S390_lowcore.mcck_interruption_code;
 	mcck = this_cpu_ptr(&cpu_mcck);
 	umode = user_mode(regs);
 
-	if (mci->sd) {
+	if (mci.sd) {
 		/* System damage -> stopping machine */
-		s390_handle_damage("received system damage machine check.");
+		s390_handle_damage();
 	}
-	if (mci->pd) {
-		if (mci->b) {
+	if (mci.pd) {
+		if (mci.b) {
 			/* Processing backup -> verify if we can survive this */
 			u64 z_mcic, o_mcic, t_mcic;
 			z_mcic = (1ULL<<63 | 1ULL<<59 | 1ULL<<29);
@@ -271,12 +276,11 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 				  1ULL<<36 | 1ULL<<35 | 1ULL<<34 | 1ULL<<32 |
 				  1ULL<<30 | 1ULL<<21 | 1ULL<<20 | 1ULL<<17 |
 				  1ULL<<16);
-			t_mcic = *(u64 *)mci;
+			t_mcic = mci.val;
 
 			if (((t_mcic & z_mcic) != 0) ||
 			    ((t_mcic & o_mcic) != o_mcic)) {
-				s390_handle_damage("processing backup machine "
-						   "check with damage.");
+				s390_handle_damage();
 			}
 
 			/*
@@ -291,64 +295,62 @@ void notrace s390_do_machine_check(struct pt_regs *regs)
 				ipd_count = 1;
 			last_ipd = tmp;
 			if (ipd_count == MAX_IPD_COUNT)
-				s390_handle_damage("too many ipd retries.");
+				s390_handle_damage();
 			spin_unlock(&ipd_lock);
 		} else {
 			/* Processing damage -> stopping machine */
-			s390_handle_damage("received instruction processing "
-					   "damage machine check.");
+			s390_handle_damage();
 		}
 	}
-	if (s390_revalidate_registers(mci)) {
+	if (s390_validate_registers(mci)) {
 		if (umode) {
 			/*
 			 * Couldn't restore all register contents while in
 			 * user mode -> mark task for termination.
 			 */
 			mcck->kill_task = 1;
-			mcck->mcck_code = *(unsigned long long *) mci;
+			mcck->mcck_code = mci.val;
 			set_cpu_flag(CIF_MCCK_PENDING);
 		} else {
 			/*
 			 * Couldn't restore all register contents while in
 			 * kernel mode -> stopping machine.
 			 */
-			s390_handle_damage("unable to revalidate registers.");
+			s390_handle_damage();
 		}
 	}
-	if (mci->cd) {
+	if (mci.cd) {
 		/* Timing facility damage */
-		s390_handle_damage("TOD clock damaged");
+		s390_handle_damage();
 	}
-	if (mci->ed && mci->ec) {
+	if (mci.ed && mci.ec) {
 		/* External damage */
 		if (S390_lowcore.external_damage_code & (1U << ED_ETR_SYNC))
-			etr_sync_check();
+			mcck->etr_queue |= etr_sync_check();
 		if (S390_lowcore.external_damage_code & (1U << ED_ETR_SWITCH))
-			etr_switch_to_local();
+			mcck->etr_queue |= etr_switch_to_local();
 		if (S390_lowcore.external_damage_code & (1U << ED_STP_SYNC))
-			stp_sync_check();
+			mcck->stp_queue |= stp_sync_check();
 		if (S390_lowcore.external_damage_code & (1U << ED_STP_ISLAND))
-			stp_island_check();
+			mcck->stp_queue |= stp_island_check();
+		if (mcck->etr_queue || mcck->stp_queue)
+			set_cpu_flag(CIF_MCCK_PENDING);
 	}
-	if (mci->se)
+	if (mci.se)
 		/* Storage error uncorrected */
-		s390_handle_damage("received storage error uncorrected "
-				   "machine check.");
-	if (mci->ke)
+		s390_handle_damage();
+	if (mci.ke)
 		/* Storage key-error uncorrected */
-		s390_handle_damage("received storage key-error uncorrected "
-				   "machine check.");
-	if (mci->ds && mci->fa)
+		s390_handle_damage();
+	if (mci.ds && mci.fa)
 		/* Storage degradation */
-		s390_handle_damage("received storage degradation machine "
-				   "check.");
-	if (mci->cp) {
+		s390_handle_damage();
+	if (mci.cp) {
 		/* Channel report word pending */
 		mcck->channel_report = 1;
 		set_cpu_flag(CIF_MCCK_PENDING);
 	}
-	if (mci->w) {
+	if (mci.w) {
 		/* Warning pending */
 		mcck->warning = 1;
 		set_cpu_flag(CIF_MCCK_PENDING);
diff --git a/arch/s390/kernel/perf_cpum_sf.c b/arch/s390/kernel/perf_cpum_sf.c
index b973972f6ba5..3d8da1e742c2 100644
--- a/arch/s390/kernel/perf_cpum_sf.c
+++ b/arch/s390/kernel/perf_cpum_sf.c
@@ -1019,11 +1019,13 @@ static int perf_push_sample(struct perf_event *event, struct sf_raw_sample *sfr)
 		break;
 	}
 
-	/* The host-program-parameter (hpp) contains the pid of
-	 * the CPU thread as set by sie64a() in entry.S.
-	 * If non-zero assume a guest sample.
+	/*
+	 * A non-zero guest program parameter indicates a guest
+	 * sample.
+	 * Note that some early samples might be misaccounted to
+	 * the host.
 	 */
-	if (sfr->basic.hpp)
+	if (sfr->basic.gpp)
 		sde_regs->in_guest = 1;
 
 	overflow = 0;
diff --git a/arch/s390/kernel/process.c b/arch/s390/kernel/process.c
index f2dac9f0799d..688a3aad9c79 100644
--- a/arch/s390/kernel/process.c
+++ b/arch/s390/kernel/process.c
@@ -23,6 +23,7 @@
 #include <linux/kprobes.h>
 #include <linux/random.h>
 #include <linux/module.h>
+#include <linux/init_task.h>
 #include <asm/io.h>
 #include <asm/processor.h>
 #include <asm/vtimer.h>
@@ -36,6 +37,9 @@
 
 asmlinkage void ret_from_fork(void) asm ("ret_from_fork");
 
+/* FPU save area for the init task */
+__vector128 init_task_fpu_regs[__NUM_VXRS] __init_task_data;
+
 /*
  * Return saved PC of a blocked thread. used in kernel/sched.
  * resume in entry.S does not create a new stack frame, it
@@ -87,31 +91,29 @@ void arch_release_task_struct(struct task_struct *tsk)
 
 int arch_dup_task_struct(struct task_struct *dst, struct task_struct *src)
 {
+	size_t fpu_regs_size;
+
 	*dst = *src;
 
-	/* Set up a new floating-point register save area */
-	dst->thread.fpu.fpc = 0;
-	dst->thread.fpu.flags = 0;	/* Always start with VX disabled */
-	dst->thread.fpu.fprs = kzalloc(sizeof(freg_t) * __NUM_FPRS,
-				       GFP_KERNEL|__GFP_REPEAT);
-	if (!dst->thread.fpu.fprs)
+	/*
+	 * If the vector extension is available, it is enabled for all tasks,
+	 * and, thus, the FPU register save area must be allocated accordingly.
+	 */
+	fpu_regs_size = MACHINE_HAS_VX ? sizeof(__vector128) * __NUM_VXRS
+				       : sizeof(freg_t) * __NUM_FPRS;
+	dst->thread.fpu.regs = kzalloc(fpu_regs_size, GFP_KERNEL|__GFP_REPEAT);
+	if (!dst->thread.fpu.regs)
 		return -ENOMEM;
 
 	/*
 	 * Save the floating-point or vector register state of the current
-	 * task.  The state is not saved for early kernel threads, for example,
-	 * the init_task, which do not have an allocated save area.
-	 * The CIF_FPU flag is set in any case to lazy clear or restore a saved
-	 * state when switching to a different task or returning to user space.
+	 * task and set the CIF_FPU flag to lazy restore the FPU register
+	 * state when returning to user space.
 	 */
 	save_fpu_regs();
 	dst->thread.fpu.fpc = current->thread.fpu.fpc;
-	if (is_vx_task(current))
-		convert_vx_to_fp(dst->thread.fpu.fprs,
-				 current->thread.fpu.vxrs);
-	else
-		memcpy(dst->thread.fpu.fprs, current->thread.fpu.fprs,
-		       sizeof(freg_t) * __NUM_FPRS);
+	memcpy(dst->thread.fpu.regs, current->thread.fpu.regs, fpu_regs_size);
+
 	return 0;
 }
 
@@ -169,7 +171,6 @@ int copy_thread(unsigned long clone_flags, unsigned long new_stackp,
 
 	/* Don't copy runtime instrumentation info */
 	p->thread.ri_cb = NULL;
-	p->thread.ri_signum = 0;
 	frame->childregs.psw.mask &= ~PSW_MASK_RI;
 
 	/* Set a new TLS ?  */
@@ -199,7 +200,7 @@ int dump_fpu (struct pt_regs * regs, s390_fp_regs *fpregs)
 	save_fpu_regs();
 	fpregs->fpc = current->thread.fpu.fpc;
 	fpregs->pad = 0;
-	if (is_vx_task(current))
+	if (MACHINE_HAS_VX)
 		convert_vx_to_fp((freg_t *)&fpregs->fprs,
 				 current->thread.fpu.vxrs);
 	else
diff --git a/arch/s390/kernel/processor.c b/arch/s390/kernel/processor.c
index e6e077ae3990..7ce00e7a709a 100644
--- a/arch/s390/kernel/processor.c
+++ b/arch/s390/kernel/processor.c
@@ -11,6 +11,7 @@
 #include <linux/seq_file.h>
 #include <linux/delay.h>
 #include <linux/cpu.h>
+#include <asm/diag.h>
 #include <asm/elf.h>
 #include <asm/lowcore.h>
 #include <asm/param.h>
@@ -20,8 +21,10 @@ static DEFINE_PER_CPU(struct cpuid, cpu_id);
 
 void notrace cpu_relax(void)
 {
-	if (!smp_cpu_mtid && MACHINE_HAS_DIAG44)
+	if (!smp_cpu_mtid && MACHINE_HAS_DIAG44) {
+		diag_stat_inc(DIAG_STAT_X044);
 		asm volatile("diag 0,0,0x44");
+	}
 	barrier();
 }
 EXPORT_SYMBOL(cpu_relax);
diff --git a/arch/s390/kernel/ptrace.c b/arch/s390/kernel/ptrace.c
index 8b1c8e33f184..01c37b36caf9 100644
--- a/arch/s390/kernel/ptrace.c
+++ b/arch/s390/kernel/ptrace.c
@@ -239,12 +239,12 @@ static unsigned long __peek_user(struct task_struct *child, addr_t addr)
 		 * or the child->thread.fpu.vxrs array
 		 */
 		offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
-		if (is_vx_task(child))
+		if (MACHINE_HAS_VX)
 			tmp = *(addr_t *)
 			       ((addr_t) child->thread.fpu.vxrs + 2*offset);
 		else
 			tmp = *(addr_t *)
-			       ((addr_t) &child->thread.fpu.fprs + offset);
+			       ((addr_t) child->thread.fpu.fprs + offset);
 
 	} else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
 		/*
@@ -383,12 +383,12 @@ static int __poke_user(struct task_struct *child, addr_t addr, addr_t data)
 		 * or the child->thread.fpu.vxrs array
 		 */
 		offset = addr - (addr_t) &dummy->regs.fp_regs.fprs;
-		if (is_vx_task(child))
+		if (MACHINE_HAS_VX)
 			*(addr_t *)((addr_t)
 				child->thread.fpu.vxrs + 2*offset) = data;
 		else
 			*(addr_t *)((addr_t)
-				&child->thread.fpu.fprs + offset) = data;
+				child->thread.fpu.fprs + offset) = data;
 
 	} else if (addr < (addr_t) (&dummy->regs.per_info + 1)) {
 		/*
@@ -617,12 +617,12 @@ static u32 __peek_user_compat(struct task_struct *child, addr_t addr)
 		 * or the child->thread.fpu.vxrs array
 		 */
 		offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
-		if (is_vx_task(child))
+		if (MACHINE_HAS_VX)
 			tmp = *(__u32 *)
 			       ((addr_t) child->thread.fpu.vxrs + 2*offset);
 		else
 			tmp = *(__u32 *)
-			       ((addr_t) &child->thread.fpu.fprs + offset);
+			       ((addr_t) child->thread.fpu.fprs + offset);
 
 	} else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
 		/*
@@ -742,12 +742,12 @@ static int __poke_user_compat(struct task_struct *child,
 		 * or the child->thread.fpu.vxrs array
 		 */
 		offset = addr - (addr_t) &dummy32->regs.fp_regs.fprs;
-		if (is_vx_task(child))
+		if (MACHINE_HAS_VX)
 			*(__u32 *)((addr_t)
 				child->thread.fpu.vxrs + 2*offset) = tmp;
 		else
 			*(__u32 *)((addr_t)
-				&child->thread.fpu.fprs + offset) = tmp;
+				child->thread.fpu.fprs + offset) = tmp;
 
 	} else if (addr < (addr_t) (&dummy32->regs.per_info + 1)) {
 		/*
@@ -981,7 +981,7 @@ static int s390_fpregs_set(struct task_struct *target,
 	if (rc)
 		return rc;
 
-	if (is_vx_task(target))
+	if (MACHINE_HAS_VX)
 		convert_fp_to_vx(target->thread.fpu.vxrs, fprs);
 	else
 		memcpy(target->thread.fpu.fprs, &fprs, sizeof(fprs));
@@ -1047,13 +1047,10 @@ static int s390_vxrs_low_get(struct task_struct *target,
 
 	if (!MACHINE_HAS_VX)
 		return -ENODEV;
-	if (is_vx_task(target)) {
-		if (target == current)
-			save_fpu_regs();
-		for (i = 0; i < __NUM_VXRS_LOW; i++)
-			vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1);
-	} else
-		memset(vxrs, 0, sizeof(vxrs));
+	if (target == current)
+		save_fpu_regs();
+	for (i = 0; i < __NUM_VXRS_LOW; i++)
+		vxrs[i] = *((__u64 *)(target->thread.fpu.vxrs + i) + 1);
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
 }
 
@@ -1067,11 +1064,7 @@ static int s390_vxrs_low_set(struct task_struct *target,
 
 	if (!MACHINE_HAS_VX)
 		return -ENODEV;
-	if (!is_vx_task(target)) {
-		rc = alloc_vector_registers(target);
-		if (rc)
-			return rc;
-	} else if (target == current)
+	if (target == current)
 		save_fpu_regs();
 
 	rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
@@ -1091,13 +1084,10 @@ static int s390_vxrs_high_get(struct task_struct *target,
 
 	if (!MACHINE_HAS_VX)
 		return -ENODEV;
-	if (is_vx_task(target)) {
-		if (target == current)
-			save_fpu_regs();
-		memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW,
-		       sizeof(vxrs));
-	} else
-		memset(vxrs, 0, sizeof(vxrs));
+	if (target == current)
+		save_fpu_regs();
+	memcpy(vxrs, target->thread.fpu.vxrs + __NUM_VXRS_LOW, sizeof(vxrs));
+
 	return user_regset_copyout(&pos, &count, &kbuf, &ubuf, vxrs, 0, -1);
 }
 
@@ -1110,11 +1100,7 @@ static int s390_vxrs_high_set(struct task_struct *target,
 
 	if (!MACHINE_HAS_VX)
 		return -ENODEV;
-	if (!is_vx_task(target)) {
-		rc = alloc_vector_registers(target);
-		if (rc)
-			return rc;
-	} else if (target == current)
+	if (target == current)
 		save_fpu_regs();
 
 	rc = user_regset_copyin(&pos, &count, &kbuf, &ubuf,
diff --git a/arch/s390/kernel/runtime_instr.c b/arch/s390/kernel/runtime_instr.c
index 26b4ae96fdd7..fffa0e5462af 100644
--- a/arch/s390/kernel/runtime_instr.c
+++ b/arch/s390/kernel/runtime_instr.c
@@ -18,11 +18,6 @@
 /* empty control block to disable RI by loading it */
 struct runtime_instr_cb runtime_instr_empty_cb;
 
-static int runtime_instr_avail(void)
-{
-	return test_facility(64);
-}
-
 static void disable_runtime_instr(void)
 {
 	struct pt_regs *regs = task_pt_regs(current);
@@ -40,7 +35,6 @@ static void disable_runtime_instr(void)
 static void init_runtime_instr_cb(struct runtime_instr_cb *cb)
 {
 	cb->buf_limit = 0xfff;
-	cb->int_requested = 1;
 	cb->pstate = 1;
 	cb->pstate_set_buf = 1;
 	cb->pstate_sample = 1;
@@ -57,46 +51,14 @@ void exit_thread_runtime_instr(void)
 		return;
 	disable_runtime_instr();
 	kfree(task->thread.ri_cb);
-	task->thread.ri_signum = 0;
 	task->thread.ri_cb = NULL;
 }
 
-static void runtime_instr_int_handler(struct ext_code ext_code,
-				unsigned int param32, unsigned long param64)
-{
-	struct siginfo info;
-
-	if (!(param32 & CPU_MF_INT_RI_MASK))
-		return;
-
-	inc_irq_stat(IRQEXT_CMR);
-
-	if (!current->thread.ri_cb)
-		return;
-	if (current->thread.ri_signum < SIGRTMIN ||
-	    current->thread.ri_signum > SIGRTMAX) {
-		WARN_ON_ONCE(1);
-		return;
-	}
-
-	memset(&info, 0, sizeof(info));
-	info.si_signo = current->thread.ri_signum;
-	info.si_code = SI_QUEUE;
-	if (param32 & CPU_MF_INT_RI_BUF_FULL)
-		info.si_int = ENOBUFS;
-	else if (param32 & CPU_MF_INT_RI_HALTED)
-		info.si_int = ECANCELED;
-	else
-		return; /* unknown reason */
-
-	send_sig_info(current->thread.ri_signum, &info, current);
-}
-
-SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum)
+SYSCALL_DEFINE1(s390_runtime_instr, int, command)
 {
 	struct runtime_instr_cb *cb;
 
-	if (!runtime_instr_avail())
+	if (!test_facility(64))
 		return -EOPNOTSUPP;
 
 	if (command == S390_RUNTIME_INSTR_STOP) {
@@ -106,8 +68,7 @@ SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum)
 		return 0;
 	}
 
-	if (command != S390_RUNTIME_INSTR_START ||
-	    (signum < SIGRTMIN || signum > SIGRTMAX))
+	if (command != S390_RUNTIME_INSTR_START)
 		return -EINVAL;
 
 	if (!current->thread.ri_cb) {
@@ -120,7 +81,6 @@ SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum)
 	}
 
 	init_runtime_instr_cb(cb);
-	current->thread.ri_signum = signum;
 
 	/* now load the control block to make it available */
 	preempt_disable();
@@ -129,21 +89,3 @@ SYSCALL_DEFINE2(s390_runtime_instr, int, command, int, signum)
 	preempt_enable();
 	return 0;
 }
-
-static int __init runtime_instr_init(void)
-{
-	int rc;
-
-	if (!runtime_instr_avail())
-		return 0;
-
-	irq_subclass_register(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-	rc = register_external_irq(EXT_IRQ_MEASURE_ALERT,
-				   runtime_instr_int_handler);
-	if (rc)
-		irq_subclass_unregister(IRQ_SUBCLASS_MEASUREMENT_ALERT);
-	else
-		pr_info("Runtime instrumentation facility initialized\n");
-	return rc;
-}
-device_initcall(runtime_instr_init);
diff --git a/arch/s390/kernel/s390_ksyms.c b/arch/s390/kernel/s390_ksyms.c
index 5090d3dad10b..e67453b73c3c 100644
--- a/arch/s390/kernel/s390_ksyms.c
+++ b/arch/s390/kernel/s390_ksyms.c
@@ -1,6 +1,6 @@
 #include <linux/module.h>
 #include <linux/kvm_host.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
 #include <asm/ftrace.h>
 
 #ifdef CONFIG_FUNCTION_TRACER
@@ -10,7 +10,6 @@ EXPORT_SYMBOL(_mcount);
 EXPORT_SYMBOL(sie64a);
 EXPORT_SYMBOL(sie_exit);
 EXPORT_SYMBOL(save_fpu_regs);
-EXPORT_SYMBOL(__ctl_set_vx);
 #endif
 EXPORT_SYMBOL(memcpy);
 EXPORT_SYMBOL(memset);
diff --git a/arch/s390/kernel/signal.c b/arch/s390/kernel/signal.c
index 9549af102d75..028cc46cb82a 100644
--- a/arch/s390/kernel/signal.c
+++ b/arch/s390/kernel/signal.c
@@ -179,7 +179,7 @@ static int save_sigregs_ext(struct pt_regs *regs,
 	int i;
 
 	/* Save vector registers to signal stack */
-	if (is_vx_task(current)) {
+	if (MACHINE_HAS_VX) {
 		for (i = 0; i < __NUM_VXRS_LOW; i++)
 			vxrs[i] = *((__u64 *)(current->thread.fpu.vxrs + i) + 1);
 		if (__copy_to_user(&sregs_ext->vxrs_low, vxrs,
@@ -199,7 +199,7 @@ static int restore_sigregs_ext(struct pt_regs *regs,
 	int i;
 
 	/* Restore vector registers from signal stack */
-	if (is_vx_task(current)) {
+	if (MACHINE_HAS_VX) {
 		if (__copy_from_user(vxrs, &sregs_ext->vxrs_low,
 				     sizeof(sregs_ext->vxrs_low)) ||
 		    __copy_from_user(current->thread.fpu.vxrs + __NUM_VXRS_LOW,
@@ -381,8 +381,7 @@ static int setup_rt_frame(struct ksignal *ksig, sigset_t *set,
 	uc_flags = 0;
 	if (MACHINE_HAS_VX) {
 		frame_size += sizeof(_sigregs_ext);
-		if (is_vx_task(current))
-			uc_flags |= UC_VXRS;
+		uc_flags |= UC_VXRS;
 	}
 	frame = get_sigframe(&ksig->ka, regs, frame_size);
 	if (frame == (void __user *) -1UL)
diff --git a/arch/s390/kernel/smp.c b/arch/s390/kernel/smp.c
index c6355e6f3fcc..9062df575afe 100644
--- a/arch/s390/kernel/smp.c
+++ b/arch/s390/kernel/smp.c
@@ -33,6 +33,7 @@
 #include <linux/crash_dump.h>
 #include <linux/memblock.h>
 #include <asm/asm-offsets.h>
+#include <asm/diag.h>
 #include <asm/switch_to.h>
 #include <asm/facility.h>
 #include <asm/ipl.h>
@@ -261,6 +262,8 @@ static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk)
 		+ THREAD_SIZE - STACK_FRAME_OVERHEAD - sizeof(struct pt_regs);
 	lc->thread_info = (unsigned long) task_thread_info(tsk);
 	lc->current_task = (unsigned long) tsk;
+	lc->lpp = LPP_MAGIC;
+	lc->current_pid = tsk->pid;
 	lc->user_timer = ti->user_timer;
 	lc->system_timer = ti->system_timer;
 	lc->steal_timer = 0;
@@ -375,11 +378,14 @@ int smp_vcpu_scheduled(int cpu)
 
 void smp_yield_cpu(int cpu)
 {
-	if (MACHINE_HAS_DIAG9C)
+	if (MACHINE_HAS_DIAG9C) {
+		diag_stat_inc_norecursion(DIAG_STAT_X09C);
 		asm volatile("diag %0,0,0x9c"
 			     : : "d" (pcpu_devices[cpu].address));
-	else if (MACHINE_HAS_DIAG44)
+	} else if (MACHINE_HAS_DIAG44) {
+		diag_stat_inc_norecursion(DIAG_STAT_X044);
 		asm volatile("diag 0,0,0x44");
+	}
 }
 
 /*
diff --git a/arch/s390/kernel/time.c b/arch/s390/kernel/time.c
index 017c3a9bfc28..99f84ac31307 100644
--- a/arch/s390/kernel/time.c
+++ b/arch/s390/kernel/time.c
@@ -542,16 +542,17 @@ arch_initcall(etr_init);
  * Switch to local machine check. This is called when the last usable
  * ETR port goes inactive. After switch to local the clock is not in sync.
  */
-void etr_switch_to_local(void)
+int etr_switch_to_local(void)
 {
 	if (!etr_eacr.sl)
-		return;
+		return 0;
 	disable_sync_clock(NULL);
 	if (!test_and_set_bit(ETR_EVENT_SWITCH_LOCAL, &etr_events)) {
 		etr_eacr.es = etr_eacr.sl = 0;
 		etr_setr(&etr_eacr);
-		queue_work(time_sync_wq, &etr_work);
+		return 1;
 	}
+	return 0;
 }
 
 /*
@@ -560,16 +561,22 @@ void etr_switch_to_local(void)
  * After a ETR sync check the clock is not in sync. The machine check
  * is broadcasted to all cpus at the same time.
  */
-void etr_sync_check(void)
+int etr_sync_check(void)
 {
 	if (!etr_eacr.es)
-		return;
+		return 0;
 	disable_sync_clock(NULL);
 	if (!test_and_set_bit(ETR_EVENT_SYNC_CHECK, &etr_events)) {
 		etr_eacr.es = 0;
 		etr_setr(&etr_eacr);
-		queue_work(time_sync_wq, &etr_work);
+		return 1;
 	}
+	return 0;
+}
+
+void etr_queue_work(void)
+{
+	queue_work(time_sync_wq, &etr_work);
 }
 
 /*
@@ -1504,10 +1511,10 @@ static void stp_timing_alert(struct stp_irq_parm *intparm)
  * After a STP sync check the clock is not in sync. The machine check
  * is broadcasted to all cpus at the same time.
  */
-void stp_sync_check(void)
+int stp_sync_check(void)
 {
 	disable_sync_clock(NULL);
-	queue_work(time_sync_wq, &stp_work);
+	return 1;
 }
 
 /*
@@ -1516,12 +1523,16 @@ void stp_sync_check(void)
  * have matching CTN ids and have a valid stratum-1 configuration
  * but the configurations do not match.
  */
-void stp_island_check(void)
+int stp_island_check(void)
 {
 	disable_sync_clock(NULL);
-	queue_work(time_sync_wq, &stp_work);
+	return 1;
 }
 
+void stp_queue_work(void)
+{
+	queue_work(time_sync_wq, &stp_work);
+}
 
 static int stp_sync_clock(void *data)
 {
diff --git a/arch/s390/kernel/topology.c b/arch/s390/kernel/topology.c
index bf05e7fc3e70..40b8102fdadb 100644
--- a/arch/s390/kernel/topology.c
+++ b/arch/s390/kernel/topology.c
@@ -84,6 +84,7 @@ static struct mask_info *add_cpus_to_mask(struct topology_core *tl_core,
 					  struct mask_info *socket,
 					  int one_socket_per_cpu)
 {
+	struct cpu_topology_s390 *topo;
 	unsigned int core;
 
 	for_each_set_bit(core, &tl_core->mask[0], TOPOLOGY_CORE_BITS) {
@@ -95,15 +96,16 @@ static struct mask_info *add_cpus_to_mask(struct topology_core *tl_core,
 		if (lcpu < 0)
 			continue;
 		for (i = 0; i <= smp_cpu_mtid; i++) {
-			per_cpu(cpu_topology, lcpu + i).book_id = book->id;
-			per_cpu(cpu_topology, lcpu + i).core_id = rcore;
-			per_cpu(cpu_topology, lcpu + i).thread_id = lcpu + i;
+			topo = &per_cpu(cpu_topology, lcpu + i);
+			topo->book_id = book->id;
+			topo->core_id = rcore;
+			topo->thread_id = lcpu + i;
 			cpumask_set_cpu(lcpu + i, &book->mask);
 			cpumask_set_cpu(lcpu + i, &socket->mask);
 			if (one_socket_per_cpu)
-				per_cpu(cpu_topology, lcpu + i).socket_id = rcore;
+				topo->socket_id = rcore;
 			else
-				per_cpu(cpu_topology, lcpu + i).socket_id = socket->id;
+				topo->socket_id = socket->id;
 			smp_cpu_set_polarization(lcpu + i, tl_core->pp);
 		}
 		if (one_socket_per_cpu)
@@ -247,17 +249,19 @@ int topology_set_cpu_management(int fc)
 
 static void update_cpu_masks(void)
 {
+	struct cpu_topology_s390 *topo;
 	int cpu;
 
 	for_each_possible_cpu(cpu) {
-		per_cpu(cpu_topology, cpu).thread_mask = cpu_thread_map(cpu);
-		per_cpu(cpu_topology, cpu).core_mask = cpu_group_map(&socket_info, cpu);
-		per_cpu(cpu_topology, cpu).book_mask = cpu_group_map(&book_info, cpu);
+		topo = &per_cpu(cpu_topology, cpu);
+		topo->thread_mask = cpu_thread_map(cpu);
+		topo->core_mask = cpu_group_map(&socket_info, cpu);
+		topo->book_mask = cpu_group_map(&book_info, cpu);
 		if (!MACHINE_HAS_TOPOLOGY) {
-			per_cpu(cpu_topology, cpu).thread_id = cpu;
-			per_cpu(cpu_topology, cpu).core_id = cpu;
-			per_cpu(cpu_topology, cpu).socket_id = cpu;
-			per_cpu(cpu_topology, cpu).book_id = cpu;
+			topo->thread_id = cpu;
+			topo->core_id = cpu;
+			topo->socket_id = cpu;
+			topo->book_id = cpu;
 		}
 	}
 	numa_update_cpu_topology();
diff --git a/arch/s390/kernel/trace.c b/arch/s390/kernel/trace.c
new file mode 100644
index 000000000000..73239bb576c4
--- /dev/null
+++ b/arch/s390/kernel/trace.c
@@ -0,0 +1,29 @@
+/*
+ * Tracepoint definitions for s390
+ *
+ * Copyright IBM Corp. 2015
+ * Author(s): Martin Schwidefsky <schwidefsky@de.ibm.com>
+ */
+
+#include <linux/percpu.h>
+#define CREATE_TRACE_POINTS
+#include <asm/trace/diag.h>
+
+EXPORT_TRACEPOINT_SYMBOL(diagnose);
+
+static DEFINE_PER_CPU(unsigned int, diagnose_trace_depth);
+
+void trace_diagnose_norecursion(int diag_nr)
+{
+	unsigned long flags;
+	unsigned int *depth;
+
+	local_irq_save(flags);
+	depth = this_cpu_ptr(&diagnose_trace_depth);
+	if (*depth == 0) {
+		(*depth)++;
+		trace_diagnose(diag_nr);
+		(*depth)--;
+	}
+	local_irq_restore(flags);
+}
diff --git a/arch/s390/kernel/traps.c b/arch/s390/kernel/traps.c
index 9861613fb35a..1b18118bbc06 100644
--- a/arch/s390/kernel/traps.c
+++ b/arch/s390/kernel/traps.c
@@ -19,7 +19,7 @@
 #include <linux/sched.h>
 #include <linux/mm.h>
 #include <linux/slab.h>
-#include <asm/fpu-internal.h>
+#include <asm/fpu/api.h>
 #include "entry.h"
 
 int show_unhandled_signals = 1;
@@ -224,29 +224,6 @@ NOKPROBE_SYMBOL(illegal_op);
 DO_ERROR_INFO(specification_exception, SIGILL, ILL_ILLOPN,
 	      "specification exception");
 
-int alloc_vector_registers(struct task_struct *tsk)
-{
-	__vector128 *vxrs;
-	freg_t *fprs;
-
-	/* Allocate vector register save area. */
-	vxrs = kzalloc(sizeof(__vector128) * __NUM_VXRS,
-		       GFP_KERNEL|__GFP_REPEAT);
-	if (!vxrs)
-		return -ENOMEM;
-	preempt_disable();
-	if (tsk == current)
-		save_fpu_regs();
-	/* Copy the 16 floating point registers */
-	convert_fp_to_vx(vxrs, tsk->thread.fpu.fprs);
-	fprs = tsk->thread.fpu.fprs;
-	tsk->thread.fpu.vxrs = vxrs;
-	tsk->thread.fpu.flags |= FPU_USE_VX;
-	kfree(fprs);
-	preempt_enable();
-	return 0;
-}
-
 void vector_exception(struct pt_regs *regs)
 {
 	int si_code, vic;
@@ -281,13 +258,6 @@ void vector_exception(struct pt_regs *regs)
 	do_trap(regs, SIGFPE, si_code, "vector exception");
 }
 
-static int __init disable_vector_extension(char *str)
-{
-	S390_lowcore.machine_flags &= ~MACHINE_FLAG_VX;
-	return 1;
-}
-__setup("novx", disable_vector_extension);
-
 void data_exception(struct pt_regs *regs)
 {
 	__u16 __user *location;
@@ -296,15 +266,6 @@ void data_exception(struct pt_regs *regs)
 	location = get_trap_ip(regs);
 
 	save_fpu_regs();
-	/* Check for vector register enablement */
-	if (MACHINE_HAS_VX && !is_vx_task(current) &&
-	    (current->thread.fpu.fpc & FPC_DXC_MASK) == 0xfe00) {
-		alloc_vector_registers(current);
-		/* Vector data exception is suppressing, rewind psw. */
-		regs->psw.addr = __rewind_psw(regs->psw, regs->int_code >> 16);
-		clear_pt_regs_flag(regs, PIF_PER_TRAP);
-		return;
-	}
 	if (current->thread.fpu.fpc & FPC_DXC_MASK)
 		signal = SIGFPE;
 	else
diff --git a/arch/s390/kernel/vdso.c b/arch/s390/kernel/vdso.c
index 0d58269ff425..59eddb0e1a3e 100644
--- a/arch/s390/kernel/vdso.c
+++ b/arch/s390/kernel/vdso.c
@@ -299,7 +299,7 @@ static int __init vdso_init(void)
 
 	get_page(virt_to_page(vdso_data));
 
-	smp_wmb();
+	smp_mb();
 
 	return 0;
 }
diff --git a/arch/s390/kvm/kvm-s390.c b/arch/s390/kvm/kvm-s390.c
index 0a67c40eece9..c6b4063fce29 100644
--- a/arch/s390/kvm/kvm-s390.c
+++ b/arch/s390/kvm/kvm-s390.c
@@ -1292,7 +1292,6 @@ int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
 static inline void save_fpu_to(struct fpu *dst)
 {
 	dst->fpc = current->thread.fpu.fpc;
-	dst->flags = current->thread.fpu.flags;
 	dst->regs = current->thread.fpu.regs;
 }
 
@@ -1303,7 +1302,6 @@ static inline void save_fpu_to(struct fpu *dst)
 static inline void load_fpu_from(struct fpu *from)
 {
 	current->thread.fpu.fpc = from->fpc;
-	current->thread.fpu.flags = from->flags;
 	current->thread.fpu.regs = from->regs;
 }
 
@@ -1315,15 +1313,12 @@ void kvm_arch_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
 
 	if (test_kvm_facility(vcpu->kvm, 129)) {
 		current->thread.fpu.fpc = vcpu->run->s.regs.fpc;
-		current->thread.fpu.flags = FPU_USE_VX;
 		/*
 		 * Use the register save area in the SIE-control block
 		 * for register restore and save in kvm_arch_vcpu_put()
 		 */
 		current->thread.fpu.vxrs =
 			(__vector128 *)&vcpu->run->s.regs.vrs;
-		/* Always enable the vector extension for KVM */
-		__ctl_set_vx();
 	} else
 		load_fpu_from(&vcpu->arch.guest_fpregs);
 
@@ -2326,7 +2321,6 @@ int kvm_s390_vcpu_store_status(struct kvm_vcpu *vcpu, unsigned long addr)
 		 * registers and the FPC value and store them in the
 		 * guest_fpregs structure.
 		 */
-		WARN_ON(!is_vx_task(current));	  /* XXX remove later */
 		vcpu->arch.guest_fpregs.fpc = current->thread.fpu.fpc;
 		convert_vx_to_fp(vcpu->arch.guest_fpregs.fprs,
 				 current->thread.fpu.vxrs);
diff --git a/arch/s390/lib/delay.c b/arch/s390/lib/delay.c
index 246a7eb4b680..501dcd4ca4a0 100644
--- a/arch/s390/lib/delay.c
+++ b/arch/s390/lib/delay.c
@@ -12,8 +12,10 @@
 #include <linux/module.h>
 #include <linux/irqflags.h>
 #include <linux/interrupt.h>
+#include <linux/irq.h>
 #include <asm/vtimer.h>
 #include <asm/div64.h>
+#include <asm/idle.h>
 
 void __delay(unsigned long loops)
 {
@@ -30,26 +32,22 @@ EXPORT_SYMBOL(__delay);
 
 static void __udelay_disabled(unsigned long long usecs)
 {
-	unsigned long cr0, cr6, new;
-	u64 clock_saved, end;
+	unsigned long cr0, cr0_new, psw_mask;
+	struct s390_idle_data idle;
+	u64 end;
 
 	end = get_tod_clock() + (usecs << 12);
-	clock_saved = local_tick_disable();
 	__ctl_store(cr0, 0, 0);
-	__ctl_store(cr6, 6, 6);
-	new = (cr0 &  0xffff00e0) | 0x00000800;
-	__ctl_load(new , 0, 0);
-	new = 0;
-	__ctl_load(new, 6, 6);
-	lockdep_off();
-	do {
-		set_clock_comparator(end);
-		enabled_wait();
-	} while (get_tod_clock_fast() < end);
-	lockdep_on();
+	cr0_new = cr0 & ~CR0_IRQ_SUBCLASS_MASK;
+	cr0_new |= (1UL << (63 - 52)); /* enable clock comparator irq */
+	__ctl_load(cr0_new, 0, 0);
+	psw_mask = __extract_psw() | PSW_MASK_EXT | PSW_MASK_WAIT;
+	set_clock_comparator(end);
+	set_cpu_flag(CIF_IGNORE_IRQ);
+	psw_idle(&idle, psw_mask);
+	clear_cpu_flag(CIF_IGNORE_IRQ);
+	set_clock_comparator(S390_lowcore.clock_comparator);
 	__ctl_load(cr0, 0, 0);
-	__ctl_load(cr6, 6, 6);
-	local_tick_enable(clock_saved);
 }
 
 static void __udelay_enabled(unsigned long long usecs)
diff --git a/arch/s390/lib/find.c b/arch/s390/lib/find.c
index 922003c1b90d..d90b9245ea41 100644
--- a/arch/s390/lib/find.c
+++ b/arch/s390/lib/find.c
@@ -1,10 +1,8 @@
 /*
  * MSB0 numbered special bitops handling.
  *
- * On s390x the bits are numbered:
+ * The bits are numbered:
  *   |0..............63|64............127|128...........191|192...........255|
- * and on s390:
- *   |0.....31|32....63|64....95|96...127|128..159|160..191|192..223|224..255|
  *
  * The reason for this bit numbering is the fact that the hardware sets bits
  * in a bitmap starting at bit 0 (MSB) and we don't want to scan the bitmap
diff --git a/arch/s390/lib/spinlock.c b/arch/s390/lib/spinlock.c
index d6c9991f7797..427aa44b3505 100644
--- a/arch/s390/lib/spinlock.c
+++ b/arch/s390/lib/spinlock.c
@@ -197,7 +197,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw, unsigned int prev)
 		}
 		old = ACCESS_ONCE(rw->lock);
 		owner = ACCESS_ONCE(rw->owner);
-		smp_rmb();
+		smp_mb();
 		if ((int) old >= 0) {
 			prev = __RAW_LOCK(&rw->lock, 0x80000000, __RAW_OP_OR);
 			old = prev;
@@ -231,7 +231,7 @@ void _raw_write_lock_wait(arch_rwlock_t *rw)
 		    _raw_compare_and_swap(&rw->lock, old, old | 0x80000000))
 			prev = old;
 		else
-			smp_rmb();
+			smp_mb();
 		if ((old & 0x7fffffff) == 0 && (int) prev >= 0)
 			break;
 		if (MACHINE_HAS_CAD)
diff --git a/arch/s390/mm/extmem.c b/arch/s390/mm/extmem.c
index 23c496957c22..18fccc303db7 100644
--- a/arch/s390/mm/extmem.c
+++ b/arch/s390/mm/extmem.c
@@ -18,6 +18,7 @@
 #include <linux/bootmem.h>
 #include <linux/ctype.h>
 #include <linux/ioport.h>
+#include <asm/diag.h>
 #include <asm/page.h>
 #include <asm/pgtable.h>
 #include <asm/ebcdic.h>
@@ -112,6 +113,7 @@ dcss_set_subcodes(void)
 	ry = DCSS_FINDSEGX;
 
 	strcpy(name, "dummy");
+	diag_stat_inc(DIAG_STAT_X064);
 	asm volatile(
 		"	diag	%0,%1,0x64\n"
 		"0:	ipm	%2\n"
@@ -205,6 +207,7 @@ dcss_diag(int *func, void *parameter,
 	ry = (unsigned long) *func;
 
 	/* 64-bit Diag x'64' new subcode, keep in 64-bit addressing mode */
+	diag_stat_inc(DIAG_STAT_X064);
 	if (*func > DCSS_SEGEXT)
 		asm volatile(
 			"	diag	%0,%1,0x64\n"
diff --git a/arch/s390/mm/fault.c b/arch/s390/mm/fault.c
index f985856a538b..ec1a30d0d11a 100644
--- a/arch/s390/mm/fault.c
+++ b/arch/s390/mm/fault.c
@@ -30,6 +30,7 @@
 #include <linux/uaccess.h>
 #include <linux/hugetlb.h>
 #include <asm/asm-offsets.h>
+#include <asm/diag.h>
 #include <asm/pgtable.h>
 #include <asm/irq.h>
 #include <asm/mmu_context.h>
@@ -589,7 +590,7 @@ int pfault_init(void)
 		.reffcode = 0,
 		.refdwlen = 5,
 		.refversn = 2,
-		.refgaddr = __LC_CURRENT_PID,
+		.refgaddr = __LC_LPP,
 		.refselmk = 1ULL << 48,
 		.refcmpmk = 1ULL << 48,
 		.reserved = __PF_RES_FIELD };
@@ -597,6 +598,7 @@ int pfault_init(void)
 
 	if (pfault_disable)
 		return -1;
+	diag_stat_inc(DIAG_STAT_X258);
 	asm volatile(
 		"	diag	%1,%0,0x258\n"
 		"0:	j	2f\n"
@@ -618,6 +620,7 @@ void pfault_fini(void)
 
 	if (pfault_disable)
 		return;
+	diag_stat_inc(DIAG_STAT_X258);
 	asm volatile(
 		"	diag	%0,0,0x258\n"
 		"0:\n"
@@ -646,7 +649,7 @@ static void pfault_interrupt(struct ext_code ext_code,
 		return;
 	inc_irq_stat(IRQEXT_PFL);
 	/* Get the token (= pid of the affected task). */
-	pid = param64;
+	pid = param64 & LPP_PFAULT_PID_MASK;
 	rcu_read_lock();
 	tsk = find_task_by_pid_ns(pid, &init_pid_ns);
 	if (tsk)
diff --git a/arch/s390/mm/hugetlbpage.c b/arch/s390/mm/hugetlbpage.c
index fb4bf2c4379e..f81096b6940d 100644
--- a/arch/s390/mm/hugetlbpage.c
+++ b/arch/s390/mm/hugetlbpage.c
@@ -40,6 +40,7 @@ static inline pmd_t __pte_to_pmd(pte_t pte)
 		pmd_val(pmd) |= (pte_val(pte) & _PAGE_PROTECT);
 		pmd_val(pmd) |= (pte_val(pte) & _PAGE_DIRTY) << 10;
 		pmd_val(pmd) |= (pte_val(pte) & _PAGE_YOUNG) << 10;
+		pmd_val(pmd) |= (pte_val(pte) & _PAGE_SOFT_DIRTY) << 13;
 	} else
 		pmd_val(pmd) = _SEGMENT_ENTRY_INVALID;
 	return pmd;
@@ -78,6 +79,7 @@ static inline pte_t __pmd_to_pte(pmd_t pmd)
 		pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_PROTECT);
 		pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_DIRTY) >> 10;
 		pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_YOUNG) >> 10;
+		pte_val(pte) |= (pmd_val(pmd) & _SEGMENT_ENTRY_SOFT_DIRTY) >> 13;
 	} else
 		pte_val(pte) = _PAGE_INVALID;
 	return pte;
diff --git a/arch/s390/numa/mode_emu.c b/arch/s390/numa/mode_emu.c
index 30b2698a28e2..828d0695d0d4 100644
--- a/arch/s390/numa/mode_emu.c
+++ b/arch/s390/numa/mode_emu.c
@@ -436,9 +436,15 @@ static void emu_update_cpu_topology(void)
  */
 static unsigned long emu_setup_size_adjust(unsigned long size)
 {
+	unsigned long size_new;
+
 	size = size ? : CONFIG_EMU_SIZE;
-	size = roundup(size, memory_block_size_bytes());
-	return size;
+	size_new = roundup(size, memory_block_size_bytes());
+	if (size_new == size)
+		return size;
+	pr_warn("Increasing memory stripe size from %ld MB to %ld MB\n",
+		size >> 20, size_new >> 20);
+	return size_new;
 }
 
 /*
diff --git a/arch/s390/pci/pci_insn.c b/arch/s390/pci/pci_insn.c
index dcc2634ccbe2..10ca15dcab11 100644
--- a/arch/s390/pci/pci_insn.c
+++ b/arch/s390/pci/pci_insn.c
@@ -16,11 +16,11 @@
 static inline void zpci_err_insn(u8 cc, u8 status, u64 req, u64 offset)
 {
 	struct {
-		u8 cc;
-		u8 status;
 		u64 req;
 		u64 offset;
-	} data = {cc, status, req, offset};
+		u8 cc;
+		u8 status;
+	} __packed data = {req, offset, cc, status};
 
 	zpci_err_hex(&data, sizeof(data));
 }
diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h
index c0b41f111a9a..6ec0c8b2e9df 100644
--- a/arch/x86/include/asm/pgtable.h
+++ b/arch/x86/include/asm/pgtable.h
@@ -325,6 +325,16 @@ static inline pmd_t pmd_mksoft_dirty(pmd_t pmd)
 	return pmd_set_flags(pmd, _PAGE_SOFT_DIRTY);
 }
 
+static inline pte_t pte_clear_soft_dirty(pte_t pte)
+{
+	return pte_clear_flags(pte, _PAGE_SOFT_DIRTY);
+}
+
+static inline pmd_t pmd_clear_soft_dirty(pmd_t pmd)
+{
+	return pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY);
+}
+
 #endif /* CONFIG_HAVE_ARCH_SOFT_DIRTY */
 
 /*