summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r--Documentation/DocBook/kernel-hacking.tmpl2
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-meson.txt23
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt3
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-st.txt8
-rw-r--r--Documentation/devicetree/bindings/pwm/pwm-sun4i.txt1
-rw-r--r--Documentation/devicetree/bindings/thermal/max77620_thermal.txt70
-rw-r--r--Documentation/devicetree/bindings/thermal/mediatek-thermal.txt4
-rw-r--r--Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt121
-rw-r--r--Documentation/devicetree/bindings/thermal/qcom-tsens.txt21
-rw-r--r--Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt3
-rw-r--r--Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt3
-rw-r--r--Documentation/kernel-parameters.txt12
-rw-r--r--Documentation/thermal/sysfs-api.txt19
-rw-r--r--Documentation/watchdog/watchdog-kernel-api.txt33
-rw-r--r--MAINTAINERS21
-rw-r--r--Makefile1
-rw-r--r--arch/arm/boot/dts/tegra124-jetson-tk1.dts18
-rw-r--r--arch/arm/boot/dts/tegra124.dtsi83
-rw-r--r--arch/arm/configs/exynos_defconfig2
-rw-r--r--arch/arm/mm/fault.h1
-rw-r--r--arch/arm64/boot/dts/nvidia/tegra132.dtsi119
-rw-r--r--arch/arm64/boot/dts/nvidia/tegra210.dtsi127
-rw-r--r--arch/frv/include/asm/pgtable.h1
-rw-r--r--arch/frv/include/asm/segment.h1
-rw-r--r--arch/frv/include/asm/uaccess.h2
-rw-r--r--arch/m68k/include/asm/uaccess_no.h3
-rw-r--r--arch/microblaze/include/asm/uaccess.h3
-rw-r--r--arch/mips/include/asm/extable.h13
-rw-r--r--arch/mips/include/asm/module.h2
-rw-r--r--arch/mips/include/asm/uaccess.h9
-rw-r--r--arch/mips/lasat/picvue_proc.c1
-rw-r--r--arch/mn10300/include/asm/processor.h1
-rw-r--r--arch/mn10300/include/asm/uaccess.h7
-rw-r--r--arch/mn10300/kernel/signal.c6
-rw-r--r--arch/openrisc/include/asm/uaccess.h4
-rw-r--r--arch/parisc/include/asm/pgtable.h6
-rw-r--r--arch/parisc/include/asm/traps.h1
-rw-r--r--arch/parisc/kernel/traps.c4
-rw-r--r--arch/parisc/kernel/vmlinux.lds.S14
-rw-r--r--arch/parisc/mm/fault.c19
-rw-r--r--arch/parisc/mm/init.c2
-rw-r--r--arch/score/include/asm/extable.h11
-rw-r--r--arch/score/include/asm/module.h2
-rw-r--r--arch/score/include/asm/uaccess.h8
-rw-r--r--arch/sh/include/asm/uaccess.h2
-rw-r--r--arch/sparc/include/asm/elf_64.h2
-rw-r--r--arch/sparc/include/asm/extable_64.h20
-rw-r--r--arch/sparc/include/asm/uaccess_64.h18
-rw-r--r--arch/x86/include/asm/cacheflush.h1
-rw-r--r--arch/x86/include/asm/extable.h35
-rw-r--r--arch/x86/include/asm/sections.h2
-rw-r--r--arch/x86/include/asm/uaccess.h32
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/xtensa/include/asm/asm-uaccess.h160
-rw-r--r--arch/xtensa/include/asm/uaccess.h153
-rw-r--r--arch/xtensa/kernel/coprocessor.S2
-rw-r--r--arch/xtensa/kernel/entry.S2
-rw-r--r--drivers/acpi/thermal.c3
-rw-r--r--drivers/char/tb0219.c1
-rw-r--r--drivers/net/bonding/bond_main.c4
-rw-r--r--drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c2
-rw-r--r--drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c4
-rw-r--r--drivers/net/ethernet/qlogic/qed/qed_ll2.c4
-rw-r--r--drivers/net/ethernet/qlogic/qed/qed_roce.c2
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_main.c21
-rw-r--r--drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c10
-rw-r--r--drivers/net/ethernet/ti/tlan.c2
-rw-r--r--drivers/net/ethernet/xilinx/xilinx_axienet_main.c11
-rw-r--r--drivers/net/hyperv/netvsc_drv.c71
-rw-r--r--drivers/net/phy/phy.c22
-rw-r--r--drivers/net/usb/qmi_wwan.c30
-rw-r--r--drivers/net/xen-netback/common.h4
-rw-r--r--drivers/net/xen-netback/hash.c68
-rw-r--r--drivers/net/xen-netback/rx.c8
-rw-r--r--drivers/net/xen-netback/xenbus.c37
-rw-r--r--drivers/pci/host/pci-aardvark.c39
-rw-r--r--drivers/pci/host/pci-dra7xx.c103
-rw-r--r--drivers/pci/host/pci-exynos.c220
-rw-r--r--drivers/pci/host/pci-imx6.c250
-rw-r--r--drivers/pci/host/pci-keystone-dw.c123
-rw-r--r--drivers/pci/host/pci-keystone.c28
-rw-r--r--drivers/pci/host/pci-keystone.h9
-rw-r--r--drivers/pci/host/pci-layerscape.c65
-rw-r--r--drivers/pci/host/pci-mvebu.c21
-rw-r--r--drivers/pci/host/pci-rcar-gen2.c46
-rw-r--r--drivers/pci/host/pci-tegra.c237
-rw-r--r--drivers/pci/host/pci-xgene.c151
-rw-r--r--drivers/pci/host/pcie-altera.c77
-rw-r--r--drivers/pci/host/pcie-armada8k.c78
-rw-r--r--drivers/pci/host/pcie-artpec6.c115
-rw-r--r--drivers/pci/host/pcie-designware-plat.c25
-rw-r--r--drivers/pci/host/pcie-designware.c102
-rw-r--r--drivers/pci/host/pcie-designware.h7
-rw-r--r--drivers/pci/host/pcie-hisi.c86
-rw-r--r--drivers/pci/host/pcie-iproc-bcma.c14
-rw-r--r--drivers/pci/host/pcie-iproc-platform.c27
-rw-r--r--drivers/pci/host/pcie-iproc.c52
-rw-r--r--drivers/pci/host/pcie-qcom.c30
-rw-r--r--drivers/pci/host/pcie-rcar.c98
-rw-r--r--drivers/pci/host/pcie-rockchip.c4
-rw-r--r--drivers/pci/host/pcie-spear13xx.c108
-rw-r--r--drivers/pci/host/pcie-xilinx-nwl.c109
-rw-r--r--drivers/pci/host/pcie-xilinx.c62
-rw-r--r--drivers/platform/x86/acerhdf.c2
-rw-r--r--drivers/platform/x86/asus-laptop.c77
-rw-r--r--drivers/platform/x86/asus-nb-wmi.c58
-rw-r--r--drivers/platform/x86/asus-wmi.c3
-rw-r--r--drivers/platform/x86/asus-wmi.h5
-rw-r--r--drivers/platform/x86/dell-smo8800.c1
-rw-r--r--drivers/platform/x86/intel_pmc_core.c2
-rw-r--r--drivers/platform/x86/intel_pmc_ipc.c110
-rw-r--r--drivers/platform/x86/toshiba_acpi.c257
-rw-r--r--drivers/platform/x86/toshiba_bluetooth.c4
-rw-r--r--drivers/platform/x86/toshiba_haps.c14
-rw-r--r--drivers/ptp/ptp_chardev.c1
-rw-r--r--drivers/pwm/Kconfig9
-rw-r--r--drivers/pwm/Makefile1
-rw-r--r--drivers/pwm/core.c2
-rw-r--r--drivers/pwm/pwm-berlin.c84
-rw-r--r--drivers/pwm/pwm-cros-ec.c4
-rw-r--r--drivers/pwm/pwm-lpc18xx-sct.c12
-rw-r--r--drivers/pwm/pwm-meson.c529
-rw-r--r--drivers/pwm/pwm-mtk-disp.c87
-rw-r--r--drivers/pwm/pwm-samsung.c15
-rw-r--r--drivers/pwm/pwm-sti.c483
-rw-r--r--drivers/pwm/pwm-sun4i.c9
-rw-r--r--drivers/pwm/pwm-tipwmss.c19
-rw-r--r--drivers/pwm/pwm-twl.c16
-rw-r--r--drivers/pwm/sysfs.c18
-rw-r--r--drivers/regulator/max8973-regulator.c3
-rw-r--r--drivers/thermal/Kconfig37
-rw-r--r--drivers/thermal/Makefile4
-rw-r--r--drivers/thermal/cpu_cooling.c2
-rw-r--r--drivers/thermal/db8500_thermal.c2
-rw-r--r--drivers/thermal/devfreq_cooling.c2
-rw-r--r--drivers/thermal/gov_bang_bang.c2
-rw-r--r--drivers/thermal/hisi_thermal.c3
-rw-r--r--drivers/thermal/imx_thermal.c4
-rw-r--r--drivers/thermal/int340x_thermal/int3402_thermal.c3
-rw-r--r--drivers/thermal/int340x_thermal/int3403_thermal.c9
-rw-r--r--drivers/thermal/int340x_thermal/int340x_thermal_zone.c60
-rw-r--r--drivers/thermal/int340x_thermal/int340x_thermal_zone.h6
-rw-r--r--drivers/thermal/int340x_thermal/processor_thermal_device.c3
-rw-r--r--drivers/thermal/intel_bxt_pmic_thermal.c300
-rw-r--r--drivers/thermal/intel_soc_dts_iosf.c3
-rw-r--r--drivers/thermal/max77620_thermal.c166
-rw-r--r--drivers/thermal/mtk_thermal.c226
-rw-r--r--drivers/thermal/of-thermal.c46
-rw-r--r--drivers/thermal/qcom-spmi-temp-alarm.c2
-rw-r--r--drivers/thermal/qcom/Kconfig11
-rw-r--r--drivers/thermal/qcom/Makefile2
-rw-r--r--drivers/thermal/qcom/tsens-8916.c113
-rw-r--r--drivers/thermal/qcom/tsens-8960.c292
-rw-r--r--drivers/thermal/qcom/tsens-8974.c244
-rw-r--r--drivers/thermal/qcom/tsens-8996.c84
-rw-r--r--drivers/thermal/qcom/tsens-common.c141
-rw-r--r--drivers/thermal/qcom/tsens.c200
-rw-r--r--drivers/thermal/qcom/tsens.h94
-rw-r--r--drivers/thermal/qoriq_thermal.c328
-rw-r--r--drivers/thermal/rcar_thermal.c26
-rw-r--r--drivers/thermal/rockchip_thermal.c107
-rw-r--r--drivers/thermal/samsung/exynos_tmu.c2
-rw-r--r--drivers/thermal/st/st_thermal_memmap.c3
-rw-r--r--drivers/thermal/tango_thermal.c19
-rw-r--r--drivers/thermal/tegra/soctherm.c842
-rw-r--r--drivers/thermal/tegra/soctherm.h10
-rw-r--r--drivers/thermal/tegra/tegra124-soctherm.c18
-rw-r--r--drivers/thermal/tegra/tegra132-soctherm.c18
-rw-r--r--drivers/thermal/tegra/tegra210-soctherm.c18
-rw-r--r--drivers/thermal/thermal_core.c106
-rw-r--r--drivers/thermal/ti-soc-thermal/ti-thermal-common.c29
-rw-r--r--drivers/thermal/user_space.c15
-rw-r--r--drivers/thermal/x86_pkg_temp_thermal.c3
-rw-r--r--drivers/video/fbdev/Kconfig7
-rw-r--r--drivers/video/fbdev/Makefile3
-rw-r--r--drivers/video/fbdev/amba-clcd-nomadik.c259
-rw-r--r--drivers/video/fbdev/amba-clcd-nomadik.h24
-rw-r--r--drivers/video/fbdev/amba-clcd-versatile.c395
-rw-r--r--drivers/video/fbdev/amba-clcd-versatile.h17
-rw-r--r--drivers/video/fbdev/amba-clcd.c190
-rw-r--r--drivers/video/fbdev/arcfb.c4
-rw-r--r--drivers/video/fbdev/asiliantfb.c4
-rw-r--r--drivers/video/fbdev/aty/aty128fb.c6
-rw-r--r--drivers/video/fbdev/aty/atyfb_base.c2
-rw-r--r--drivers/video/fbdev/aty/radeon_monitor.c2
-rw-r--r--drivers/video/fbdev/au1200fb.c1
-rw-r--r--drivers/video/fbdev/bfin_adv7393fb.c5
-rw-r--r--drivers/video/fbdev/efifb.c6
-rw-r--r--drivers/video/fbdev/exynos/Kconfig32
-rw-r--r--drivers/video/fbdev/exynos/Makefile9
-rw-r--r--drivers/video/fbdev/exynos/exynos_mipi_dsi.c574
-rw-r--r--drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c880
-rw-r--r--drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h46
-rw-r--r--drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c618
-rw-r--r--drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h112
-rw-r--r--drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h149
-rw-r--r--drivers/video/fbdev/exynos/s6e8ax0.c887
-rw-r--r--drivers/video/fbdev/hecubafb.c4
-rw-r--r--drivers/video/fbdev/hgafb.c2
-rw-r--r--drivers/video/fbdev/i740fb.c2
-rw-r--r--drivers/video/fbdev/i810/i810_main.c2
-rw-r--r--drivers/video/fbdev/intelfb/intelfbdrv.c5
-rw-r--r--drivers/video/fbdev/kyro/fbdev.c2
-rw-r--r--drivers/video/fbdev/matrox/matroxfb_Ti3026.c2
-rw-r--r--drivers/video/fbdev/matrox/matroxfb_g450.c2
-rw-r--r--drivers/video/fbdev/mb862xx/mb862xx-i2c.c9
-rw-r--r--drivers/video/fbdev/mx3fb.c2
-rw-r--r--drivers/video/fbdev/mxsfb.c9
-rw-r--r--drivers/video/fbdev/offb.c15
-rw-r--r--drivers/video/fbdev/omap/lcd_mipid.c9
-rw-r--r--drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c14
-rw-r--r--drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c7
-rw-r--r--drivers/video/fbdev/omap2/omapfb/dss/dsi.c12
-rw-r--r--drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c8
-rw-r--r--drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c8
-rw-r--r--drivers/video/fbdev/pm2fb.c2
-rw-r--r--drivers/video/fbdev/pxafb.c3
-rw-r--r--drivers/video/fbdev/s1d13xxxfb.c4
-rw-r--r--drivers/video/fbdev/s3c2410fb.c2
-rw-r--r--drivers/video/fbdev/s3c2410fb.h2
-rw-r--r--drivers/video/fbdev/savage/savagefb_driver.c2
-rw-r--r--drivers/video/fbdev/simplefb.c13
-rw-r--r--drivers/video/fbdev/sm712fb.c2
-rw-r--r--drivers/video/fbdev/smscufx.c2
-rw-r--r--drivers/video/fbdev/ssd1307fb.c9
-rw-r--r--drivers/video/fbdev/tdfxfb.c4
-rw-r--r--drivers/video/fbdev/uvesafb.c2
-rw-r--r--drivers/video/fbdev/vfb.c129
-rw-r--r--drivers/video/fbdev/vga16fb.c2
-rw-r--r--drivers/watchdog/Kconfig49
-rw-r--r--drivers/watchdog/Makefile8
-rw-r--r--drivers/watchdog/asm9260_wdt.c1
-rw-r--r--drivers/watchdog/ath79_wdt.c1
-rw-r--r--drivers/watchdog/bcm7038_wdt.c2
-rw-r--r--drivers/watchdog/cadence_wdt.c20
-rw-r--r--drivers/watchdog/dw_wdt.c11
-rw-r--r--drivers/watchdog/hpwdt.c8
-rw-r--r--drivers/watchdog/iTCO_wdt.c2
-rw-r--r--drivers/watchdog/imx2_wdt.c60
-rw-r--r--drivers/watchdog/kempld_wdt.c2
-rw-r--r--drivers/watchdog/mt7621_wdt.c1
-rw-r--r--drivers/watchdog/of_xilinx_wdt.c25
-rw-r--r--drivers/watchdog/pretimeout_noop.c47
-rw-r--r--drivers/watchdog/pretimeout_panic.c47
-rw-r--r--drivers/watchdog/rn5t618_wdt.c2
-rw-r--r--drivers/watchdog/rt2880_wdt.c1
-rw-r--r--drivers/watchdog/softdog.c24
-rw-r--r--drivers/watchdog/st_lpc_wdt.c33
-rw-r--r--drivers/watchdog/tegra_wdt.c2
-rw-r--r--drivers/watchdog/txx9wdt.c6
-rw-r--r--drivers/watchdog/w83627hf_wdt.c2
-rw-r--r--drivers/watchdog/watchdog_core.c2
-rw-r--r--drivers/watchdog/watchdog_dev.c101
-rw-r--r--drivers/watchdog/watchdog_pretimeout.c220
-rw-r--r--drivers/watchdog/watchdog_pretimeout.h60
-rw-r--r--drivers/watchdog/ziirave_wdt.c409
-rw-r--r--fs/compat_ioctl.c2
-rw-r--r--fs/exportfs/expfs.c10
-rw-r--r--fs/nfs/cache_lib.c2
-rw-r--r--fs/nfs/callback.c136
-rw-r--r--fs/nfs/callback.h12
-rw-r--r--fs/nfs/callback_proc.c16
-rw-r--r--fs/nfs/callback_xdr.c53
-rw-r--r--fs/nfs/client.c10
-rw-r--r--fs/nfs/delegation.c213
-rw-r--r--fs/nfs/delegation.h8
-rw-r--r--fs/nfs/dir.c24
-rw-r--r--fs/nfs/direct.c2
-rw-r--r--fs/nfs/file.c11
-rw-r--r--fs/nfs/flexfilelayout/flexfilelayout.c2
-rw-r--r--fs/nfs/internal.h7
-rw-r--r--fs/nfs/netns.h2
-rw-r--r--fs/nfs/nfs42proc.c1
-rw-r--r--fs/nfs/nfs4_fs.h15
-rw-r--r--fs/nfs/nfs4client.c118
-rw-r--r--fs/nfs/nfs4proc.c901
-rw-r--r--fs/nfs/nfs4session.h2
-rw-r--r--fs/nfs/nfs4state.c84
-rw-r--r--fs/nfs/nfs4xdr.c42
-rw-r--r--fs/nfs/pnfs.c73
-rw-r--r--fs/nfs/pnfs.h5
-rw-r--r--fs/nfs/pnfs_nfs.c58
-rw-r--r--fs/nfs/super.c10
-rw-r--r--fs/nfsd/flexfilelayout.c1
-rw-r--r--fs/nfsd/netns.h1
-rw-r--r--fs/nfsd/nfs4callback.c64
-rw-r--r--fs/nfsd/nfs4layouts.c6
-rw-r--r--fs/nfsd/nfs4proc.c90
-rw-r--r--fs/nfsd/nfs4state.c237
-rw-r--r--fs/nfsd/nfs4xdr.c65
-rw-r--r--fs/nfsd/nfsctl.c2
-rw-r--r--fs/nfsd/nfsproc.c3
-rw-r--r--fs/nfsd/nfssvc.c18
-rw-r--r--fs/nfsd/pnfs.h1
-rw-r--r--fs/nfsd/state.h22
-rw-r--r--fs/nfsd/vfs.c16
-rw-r--r--fs/nfsd/vfs.h2
-rw-r--r--fs/nfsd/xdr4.h23
-rw-r--r--fs/nfsd/xdr4cb.h9
-rw-r--r--fs/open.c5
-rw-r--r--fs/xfs/Makefile7
-rw-r--r--fs/xfs/libxfs/xfs_ag_resv.c15
-rw-r--r--fs/xfs/libxfs/xfs_alloc.c23
-rw-r--r--fs/xfs/libxfs/xfs_bmap.c575
-rw-r--r--fs/xfs/libxfs/xfs_bmap.h67
-rw-r--r--fs/xfs/libxfs/xfs_bmap_btree.c18
-rw-r--r--fs/xfs/libxfs/xfs_btree.c8
-rw-r--r--fs/xfs/libxfs/xfs_btree.h16
-rw-r--r--fs/xfs/libxfs/xfs_defer.h2
-rw-r--r--fs/xfs/libxfs/xfs_format.h97
-rw-r--r--fs/xfs/libxfs/xfs_fs.h10
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.c24
-rw-r--r--fs/xfs/libxfs/xfs_inode_buf.h1
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.c70
-rw-r--r--fs/xfs/libxfs/xfs_inode_fork.h28
-rw-r--r--fs/xfs/libxfs/xfs_log_format.h118
-rw-r--r--fs/xfs/libxfs/xfs_refcount.c1698
-rw-r--r--fs/xfs/libxfs/xfs_refcount.h70
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.c451
-rw-r--r--fs/xfs/libxfs/xfs_refcount_btree.h74
-rw-r--r--fs/xfs/libxfs/xfs_rmap.c914
-rw-r--r--fs/xfs/libxfs/xfs_rmap.h7
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.c82
-rw-r--r--fs/xfs/libxfs/xfs_rmap_btree.h7
-rw-r--r--fs/xfs/libxfs/xfs_sb.c9
-rw-r--r--fs/xfs/libxfs/xfs_shared.h2
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.c23
-rw-r--r--fs/xfs/libxfs/xfs_trans_resv.h3
-rw-r--r--fs/xfs/libxfs/xfs_trans_space.h9
-rw-r--r--fs/xfs/libxfs/xfs_types.h3
-rw-r--r--fs/xfs/xfs_aops.c222
-rw-r--r--fs/xfs/xfs_aops.h4
-rw-r--r--fs/xfs/xfs_bmap_item.c508
-rw-r--r--fs/xfs/xfs_bmap_item.h98
-rw-r--r--fs/xfs/xfs_bmap_util.c589
-rw-r--r--fs/xfs/xfs_dir2_readdir.c3
-rw-r--r--fs/xfs/xfs_error.h10
-rw-r--r--fs/xfs/xfs_file.c221
-rw-r--r--fs/xfs/xfs_fsops.c107
-rw-r--r--fs/xfs/xfs_fsops.h3
-rw-r--r--fs/xfs/xfs_globals.c5
-rw-r--r--fs/xfs/xfs_icache.c243
-rw-r--r--fs/xfs/xfs_icache.h7
-rw-r--r--fs/xfs/xfs_inode.c51
-rw-r--r--fs/xfs/xfs_inode.h19
-rw-r--r--fs/xfs/xfs_inode_item.c2
-rw-r--r--fs/xfs/xfs_ioctl.c75
-rw-r--r--fs/xfs/xfs_iomap.c35
-rw-r--r--fs/xfs/xfs_iomap.h3
-rw-r--r--fs/xfs/xfs_iops.c1
-rw-r--r--fs/xfs/xfs_itable.c8
-rw-r--r--fs/xfs/xfs_linux.h1
-rw-r--r--fs/xfs/xfs_log_recover.c357
-rw-r--r--fs/xfs/xfs_mount.c32
-rw-r--r--fs/xfs/xfs_mount.h8
-rw-r--r--fs/xfs/xfs_ondisk.h3
-rw-r--r--fs/xfs/xfs_pnfs.c7
-rw-r--r--fs/xfs/xfs_refcount_item.c539
-rw-r--r--fs/xfs/xfs_refcount_item.h101
-rw-r--r--fs/xfs/xfs_reflink.c1688
-rw-r--r--fs/xfs/xfs_reflink.h58
-rw-r--r--fs/xfs/xfs_rmap_item.c12
-rw-r--r--fs/xfs/xfs_stats.c1
-rw-r--r--fs/xfs/xfs_stats.h18
-rw-r--r--fs/xfs/xfs_super.c87
-rw-r--r--fs/xfs/xfs_sysctl.c9
-rw-r--r--fs/xfs/xfs_sysctl.h1
-rw-r--r--fs/xfs/xfs_trace.h742
-rw-r--r--fs/xfs/xfs_trans.h29
-rw-r--r--fs/xfs/xfs_trans_bmap.c249
-rw-r--r--fs/xfs/xfs_trans_refcount.c264
-rw-r--r--fs/xfs/xfs_trans_rmap.c9
-rw-r--r--include/asm-generic/uaccess.h4
-rw-r--r--include/dt-bindings/thermal/tegra124-soctherm.h5
-rw-r--r--include/linux/amba/clcd.h63
-rw-r--r--include/linux/exportfs.h13
-rw-r--r--include/linux/falloc.h3
-rw-r--r--include/linux/mlx5/device.h13
-rw-r--r--include/linux/nfs4.h1
-rw-r--r--include/linux/nfs_fs_sb.h3
-rw-r--r--include/linux/nfs_xdr.h8
-rw-r--r--include/linux/pwm.h5
-rw-r--r--include/linux/sunrpc/auth.h1
-rw-r--r--include/linux/sunrpc/clnt.h17
-rw-r--r--include/linux/sunrpc/rpc_rdma.h39
-rw-r--r--include/linux/sunrpc/sched.h4
-rw-r--r--include/linux/sunrpc/svc_rdma.h10
-rw-r--r--include/linux/sunrpc/xdr.h12
-rw-r--r--include/linux/sunrpc/xprt.h12
-rw-r--r--include/linux/sunrpc/xprtmultipath.h2
-rw-r--r--include/linux/sunrpc/xprtrdma.h4
-rw-r--r--include/linux/thermal.h43
-rw-r--r--include/linux/watchdog.h24
-rw-r--r--include/net/bonding.h12
-rw-r--r--include/net/l3mdev.h24
-rw-r--r--include/uapi/linux/falloc.h18
-rw-r--r--include/uapi/linux/fs.h4
-rw-r--r--include/uapi/linux/nfs4.h5
-rw-r--r--include/video/exynos_mipi_dsim.h358
-rw-r--r--kernel/trace/Makefile4
-rw-r--r--mm/Makefile3
-rw-r--r--net/bridge/br_sysfs_if.c1
-rw-r--r--net/core/rtnetlink.c2
-rw-r--r--net/ipv4/route.c3
-rw-r--r--net/ipv6/tcp_ipv6.c20
-rw-r--r--net/openvswitch/flow.c2
-rw-r--r--net/openvswitch/vport-internal_dev.c2
-rw-r--r--net/openvswitch/vport.c3
-rw-r--r--net/sched/act_api.c19
-rw-r--r--net/sched/cls_api.c18
-rw-r--r--net/strparser/strparser.c2
-rw-r--r--net/sunrpc/auth.c2
-rw-r--r--net/sunrpc/auth_generic.c9
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c7
-rw-r--r--net/sunrpc/auth_unix.c9
-rw-r--r--net/sunrpc/backchannel_rqst.c8
-rw-r--r--net/sunrpc/cache.c5
-rw-r--r--net/sunrpc/clnt.c132
-rw-r--r--net/sunrpc/sched.c35
-rw-r--r--net/sunrpc/svc.c17
-rw-r--r--net/sunrpc/xdr.c11
-rw-r--r--net/sunrpc/xprt.c2
-rw-r--r--net/sunrpc/xprtmultipath.c24
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c53
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c7
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c28
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c323
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c21
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c2
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c82
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c60
-rw-r--r--net/sunrpc/xprtrdma/transport.c202
-rw-r--r--net/sunrpc/xprtrdma/verbs.c237
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h108
-rw-r--r--net/sunrpc/xprtsock.c34
-rw-r--r--net/tipc/udp_media.c2
436 files changed, 22829 insertions, 7557 deletions
diff --git a/Documentation/DocBook/kernel-hacking.tmpl b/Documentation/DocBook/kernel-hacking.tmpl
index 589b40cc5eb5..2a272275c81b 100644
--- a/Documentation/DocBook/kernel-hacking.tmpl
+++ b/Documentation/DocBook/kernel-hacking.tmpl
@@ -483,7 +483,7 @@ printk(KERN_INFO "my ip: %pI4\n", &ipaddress);
     <function>get_user()</function>
     /
     <function>put_user()</function>
-    <filename class="headerfile">include/asm/uaccess.h</filename>
+    <filename class="headerfile">include/linux/uaccess.h</filename>
    </title>  
 
    <para>
diff --git a/Documentation/devicetree/bindings/pwm/pwm-meson.txt b/Documentation/devicetree/bindings/pwm/pwm-meson.txt
new file mode 100644
index 000000000000..5376a4468cb6
--- /dev/null
+++ b/Documentation/devicetree/bindings/pwm/pwm-meson.txt
@@ -0,0 +1,23 @@
+Amlogic Meson PWM Controller
+============================
+
+Required properties:
+- compatible: Shall contain "amlogic,meson8b-pwm" or "amlogic,meson-gxbb-pwm".
+- #pwm-cells: Should be 3. See pwm.txt in this directory for a description of
+  the cells format.
+
+Optional properties:
+- clocks: Could contain one or two parents clocks phandle for each of the two
+  PWM channels.
+- clock-names: Could contain at least the "clkin0" and/or "clkin1" names.
+
+Example:
+
+	pwm_ab: pwm@8550 {
+		compatible = "amlogic,meson-gxbb-pwm";
+		reg = <0x0 0x08550 0x0 0x10>;
+		#pwm-cells = <3>;
+		status = "disabled";
+		clocks = <&xtal>, <&xtal>;
+		clock-names = "clkin0", "clkin1";
+	}
diff --git a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt b/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt
index f8f59baf6b67..6f8af2bcc7b7 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-mtk-disp.txt
@@ -2,8 +2,9 @@ MediaTek display PWM controller
 
 Required properties:
  - compatible: should be "mediatek,<name>-disp-pwm":
-   - "mediatek,mt8173-disp-pwm": found on mt8173 SoC.
+   - "mediatek,mt2701-disp-pwm": found on mt2701 SoC.
    - "mediatek,mt6595-disp-pwm": found on mt6595 SoC.
+   - "mediatek,mt8173-disp-pwm": found on mt8173 SoC.
  - reg: physical base address and length of the controller's registers.
  - #pwm-cells: must be 2. See pwm.txt in this directory for a description of
    the cell format.
diff --git a/Documentation/devicetree/bindings/pwm/pwm-st.txt b/Documentation/devicetree/bindings/pwm/pwm-st.txt
index 84d2fb807d3c..19fce774cafa 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-st.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-st.txt
@@ -13,13 +13,14 @@ Required parameters:
 - pinctrl-0: 		List of phandles pointing to pin configuration nodes
 			for PWM module.
 			For Pinctrl properties, please refer to [1].
-- clock-names: 		Set to "pwm".
+- clock-names: 		Valid entries are "pwm" and/or "capture".
 - clocks: 		phandle of the clock used by the PWM module.
 			For Clk properties, please refer to [2].
+- interrupts:		IRQ for the Capture device
 
 Optional properties:
-- st,pwm-num-chan:	Number of available channels. If not passed, the driver
-			will consider single channel by default.
+- st,pwm-num-chan:	Number of available PWM channels.  Default is 0.
+- st,capture-num-chan:	Number of available Capture channels.  Default is 0.
 
 [1] Documentation/devicetree/bindings/pinctrl/pinctrl-bindings.txt
 [2] Documentation/devicetree/bindings/clock/clock-bindings.txt
@@ -38,4 +39,5 @@ pwm1: pwm@fe510000 {
 	clocks = <&clk_sysin>;
 	clock-names = "pwm";
 	st,pwm-num-chan = <4>;
+	st,capture-num-chan = <2>;
 };
diff --git a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt b/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
index cf6068b8e974..f1cbeefb3087 100644
--- a/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
+++ b/Documentation/devicetree/bindings/pwm/pwm-sun4i.txt
@@ -6,6 +6,7 @@ Required properties:
     - "allwinner,sun5i-a10s-pwm"
     - "allwinner,sun5i-a13-pwm"
     - "allwinner,sun7i-a20-pwm"
+    - "allwinner,sun8i-h3-pwm"
   - reg: physical base address and length of the controller's registers
   - #pwm-cells: should be 3. See pwm.txt in this directory for a description of
     the cells format.
diff --git a/Documentation/devicetree/bindings/thermal/max77620_thermal.txt b/Documentation/devicetree/bindings/thermal/max77620_thermal.txt
new file mode 100644
index 000000000000..323a3b3822aa
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/max77620_thermal.txt
@@ -0,0 +1,70 @@
+Thermal driver for MAX77620 Power management IC from Maxim Semiconductor.
+
+Maxim Semiconductor MAX77620 supports alarm interrupts when its
+die temperature crosses 120C and 140C. These threshold temperatures
+are not configurable. Device does not provide the real temperature
+of die other than just indicating whether temperature is above or
+below threshold level.
+
+Required properties:
+-------------------
+#thermal-sensor-cells:	Please refer <devicetree/bindings/thermal/thermal.txt>
+			for more details.
+			The value must be 0.
+
+For more details, please refer generic thermal DT binding document
+<devicetree/bindings/thermal/thermal.txt>.
+
+Please refer <devicetree/bindings/mfd/max77620.txt> for mfd DT binding
+document for the MAX77620.
+
+Example:
+--------
+#include <dt-bindings/mfd/max77620.h>
+#include <dt-bindings/thermal/thermal.h>
+...
+
+i2c@7000d000 {
+	spmic: max77620@3c {
+		compatible = "maxim,max77620";
+		:::::
+		#thermal-sensor-cells = <0>;
+		:::
+	};
+};
+
+cool_dev: cool-dev {
+	compatible = "cooling-dev";
+	#cooling-cells = <2>;
+};
+
+thermal-zones {
+	PMIC-Die {
+		polling-delay = <0>;
+		polling-delay-passive = <0>;
+		thermal-sensors = <&spmic>;
+
+		trips {
+			pmic_die_warn_temp_thresh: hot-die {
+				temperature = <120000>;
+				type = "hot";
+				hysteresis = <0>;
+			};
+
+			pmic_die_cirt_temp_thresh: cirtical-die {
+				temperature = <140000>;
+				type = "critical";
+				hysteresis = <0>;
+			};
+		};
+
+		cooling-maps {
+			map0 {
+				trip = <&pmic_die_warn_temp_thresh>;
+				cooling-device = <&cool_dev THERMAL_NO_LIMIT
+						  THERMAL_NO_LIMIT>;
+				contribution = <100>;
+			};
+		};
+	};
+};
diff --git a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
index 81f9a512bc2a..e2f494d74d8a 100644
--- a/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/mediatek-thermal.txt
@@ -8,7 +8,9 @@ apmixedsys register space via AHB bus accesses, so a phandle to the APMIXEDSYS
 is also needed.
 
 Required properties:
-- compatible: "mediatek,mt8173-thermal"
+- compatible:
+  - "mediatek,mt8173-thermal" : For MT8173 family of SoCs
+  - "mediatek,mt2701-thermal" : For MT2701 family of SoCs
 - reg: Address range of the thermal controller
 - interrupts: IRQ for the thermal controller
 - clocks, clock-names: Clocks needed for the thermal controller. required
diff --git a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
index edebfa0a985e..b6c0ae53d4dc 100644
--- a/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
+++ b/Documentation/devicetree/bindings/thermal/nvidia,tegra124-soctherm.txt
@@ -10,8 +10,14 @@ Required properties :
 - compatible : For Tegra124, must contain "nvidia,tegra124-soctherm".
   For Tegra132, must contain "nvidia,tegra132-soctherm".
   For Tegra210, must contain "nvidia,tegra210-soctherm".
-- reg : Should contain 1 entry:
+- reg : Should contain at least 2 entries for each entry in reg-names:
   - SOCTHERM register set
+  - Tegra CAR register set: Required for Tegra124 and Tegra210.
+  - CCROC register set: Required for Tegra132.
+- reg-names :  Should contain at least 2 entries:
+  - soctherm-reg
+  - car-reg
+  - ccroc-reg
 - interrupts : Defines the interrupt used by SOCTHERM
 - clocks : Must contain an entry for each entry in clock-names.
   See ../clocks/clock-bindings.txt for details.
@@ -25,17 +31,45 @@ Required properties :
 - #thermal-sensor-cells : Should be 1. See ./thermal.txt for a description
     of this property. See <dt-bindings/thermal/tegra124-soctherm.h> for a
     list of valid values when referring to thermal sensors.
+- throttle-cfgs: A sub-node which is a container of configuration for each
+    hardware throttle events. These events can be set as cooling devices.
+  * throttle events: Sub-nodes must be named as "light" or "heavy".
+      Properties:
+      - nvidia,priority: Each throttles has its own throttle settings, so the
+        SW need to set priorities for various throttle, the HW arbiter can select
+        the final throttle settings.
+        Bigger value indicates higher priority, In general, higher priority
+        translates to lower target frequency. SW needs to ensure that critical
+        thermal alarms are given higher priority, and ensure that there is
+        no race if priority of two vectors is set to the same value.
+        The range of this value is 1~100.
+      - nvidia,cpu-throt-percent: This property is for Tegra124 and Tegra210.
+        It is the throttling depth of pulse skippers, it's the percentage
+        throttling.
+      - nvidia,cpu-throt-level: This property is only for Tegra132, it is the
+        level of pulse skippers, which used to throttle clock frequencies. It
+        indicates cpu clock throttling depth, and the depth can be programmed.
+        Must set as following values:
+        TEGRA_SOCTHERM_THROT_LEVEL_LOW, TEGRA_SOCTHERM_THROT_LEVEL_MED
+        TEGRA_SOCTHERM_THROT_LEVEL_HIGH, TEGRA_SOCTHERM_THROT_LEVEL_NONE
+      - #cooling-cells: Should be 1. This cooling device only support on/off state.
+        See ./thermal.txt for a description of this property.
 
 Note:
 - the "critical" type trip points will be set to SOC_THERM hardware as the
 shut down temperature. Once the temperature of this thermal zone is higher
 than it, the system will be shutdown or reset by hardware.
+- the "hot" type trip points will be set to SOC_THERM hardware as the throttle
+temperature. Once the the temperature of this thermal zone is higher
+than it, it will trigger the HW throttle event.
 
 Example :
 
 	soctherm@700e2000 {
 		compatible = "nvidia,tegra124-soctherm";
-		reg = <0x0 0x700e2000 0x0 0x1000>;
+		reg = <0x0 0x700e2000 0x0 0x600  /* SOC_THERM reg_base */
+			0x0 0x60006000 0x0 0x400 /* CAR reg_base */
+		reg-names = "soctherm-reg", "car-reg";
 		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
 			<&tegra_car TEGRA124_CLK_SOC_THERM>;
@@ -44,6 +78,76 @@ Example :
 		reset-names = "soctherm";
 
 		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			/*
+			 * When the "heavy" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in 85% depth
+			 */
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-percent = <85>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * When the "light" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in 50% depth
+			 */
+			throttle_light: light {
+				nvidia,priority = <80>;
+				nvidia,cpu-throt-percent = <50>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * If these two devices are triggered in same time, the HW throttle
+			 * arbiter will select the highest priority as the final throttle
+			 * settings to skip cpu pulse.
+			 */
+		};
+	};
+
+Example: referring to Tegra132's "reg", "reg-names" and "throttle-cfgs" :
+
+	soctherm@700e2000 {
+		compatible = "nvidia,tegra132-soctherm";
+		reg = <0x0 0x700e2000 0x0 0x600  /* SOC_THERM reg_base */
+			0x0 0x70040000 0x0 0x200>; /* CCROC reg_base */;
+		reg-names = "soctherm-reg", "ccroc-reg";
+
+		throttle-cfgs {
+			/*
+			 * When the "heavy" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in HIGH level
+			 */
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-level = <TEGRA_SOCTHERM_THROT_LEVEL_HIGH>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * When the "light" cooling device triggered,
+			 * the HW will skip cpu clock's pulse in MED level
+			 */
+			throttle_light: light {
+				nvidia,priority = <80>;
+				nvidia,cpu-throt-level = <TEGRA_SOCTHERM_THROT_LEVEL_MED>;
+
+				#cooling-cells = <1>;
+			};
+
+			/*
+			 * If these two devices are triggered in same time, the HW throttle
+			 * arbiter will select the highest priority as the final throttle
+			 * settings to skip cpu pulse.
+			 */
+
+		};
 	};
 
 Example: referring to thermal sensors :
@@ -62,6 +166,19 @@ Example: referring to thermal sensors :
 					hysteresis = <1000>;
 					type = "critical";
 				};
+
+				cpu_throttle_trip: throttle-trip {
+					temperature = <100000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
 			};
                 };
 	};
diff --git a/Documentation/devicetree/bindings/thermal/qcom-tsens.txt b/Documentation/devicetree/bindings/thermal/qcom-tsens.txt
new file mode 100644
index 000000000000..292ed89d900b
--- /dev/null
+++ b/Documentation/devicetree/bindings/thermal/qcom-tsens.txt
@@ -0,0 +1,21 @@
+* QCOM SoC Temperature Sensor (TSENS)
+
+Required properties:
+- compatible :
+ - "qcom,msm8916-tsens" : For 8916 Family of SoCs
+ - "qcom,msm8974-tsens" : For 8974 Family of SoCs
+ - "qcom,msm8996-tsens" : For 8996 Family of SoCs
+
+- reg: Address range of the thermal registers
+- #thermal-sensor-cells : Should be 1. See ./thermal.txt for a description.
+- Refer to Documentation/devicetree/bindings/nvmem/nvmem.txt to know how to specify
+nvmem cells
+
+Example:
+tsens: thermal-sensor@900000 {
+		compatible = "qcom,msm8916-tsens";
+		reg = <0x4a8000 0x2000>;
+		nvmem-cells = <&tsens_caldata>, <&tsens_calsel>;
+		nvmem-cell-names = "caldata", "calsel";
+		#thermal-sensor-cells = <1>;
+	};
diff --git a/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt b/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt
index 6d63782a7378..c6ae9c9d5e3e 100644
--- a/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/of-xilinx-wdt.txt
@@ -7,6 +7,8 @@ Required properties:
 - reg			: Physical base address and size
 
 Optional properties:
+- clocks		: Input clock specifier. Refer to common clock
+			  bindings.
 - clock-frequency	: Frequency of clock in Hz
 - xlnx,wdt-enable-once	: 0 - Watchdog can be restarted
 			  1 - Watchdog can be enabled just once
@@ -17,6 +19,7 @@ Example:
 axi-timebase-wdt@40100000 {
 	clock-frequency = <50000000>;
 	compatible = "xlnx,xps-timebase-wdt-1.00.a";
+	clocks = <&clkc 15>;
 	reg = <0x40100000 0x10000>;
 	xlnx,wdt-enable-once = <0x0>;
 	xlnx,wdt-interval = <0x1b>;
diff --git a/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt b/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt
index 039c5ca45577..b949039bc502 100644
--- a/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt
+++ b/Documentation/devicetree/bindings/watchdog/st_lpc_wdt.txt
@@ -9,8 +9,7 @@ functionality.
 
 Required properties
 
-- compatible 	: Must be one of: "st,stih407-lpc" "st,stih416-lpc"
-				  "st,stih415-lpc" "st,stid127-lpc"
+- compatible 	: Should be: "st,stih407-lpc"
 - reg		: LPC registers base address + size
 - interrupts    : LPC interrupt line number and associated flags
 - clocks	: Clock used by LPC device (See: ../clock/clock-bindings.txt)
diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index a1489e14f8ee..58f3c1041759 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2470,6 +2470,11 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 	nfsrootdebug	[NFS] enable nfsroot debugging messages.
 			See Documentation/filesystems/nfs/nfsroot.txt.
 
+	nfs.callback_nr_threads=
+			[NFSv4] set the total number of threads that the
+			NFS client will assign to service NFSv4 callback
+			requests.
+
 	nfs.callback_tcpport=
 			[NFS] set the TCP port on which the NFSv4 callback
 			channel should listen.
@@ -2493,6 +2498,13 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			of returning the full 64-bit number.
 			The default is to return 64-bit inode numbers.
 
+	nfs.max_session_cb_slots=
+			[NFSv4.1] Sets the maximum number of session
+			slots the client will assign to the callback
+			channel. This determines the maximum number of
+			callbacks the client will process in parallel for
+			a particular server.
+
 	nfs.max_session_slots=
 			[NFSv4.1] Sets the maximum number of session slots
 			the client will attempt to negotiate with the server.
diff --git a/Documentation/thermal/sysfs-api.txt b/Documentation/thermal/sysfs-api.txt
index efc3f3d293c4..ef473dc7f55e 100644
--- a/Documentation/thermal/sysfs-api.txt
+++ b/Documentation/thermal/sysfs-api.txt
@@ -49,6 +49,9 @@ temperature) and throttle appropriate devices.
 	.bind: bind the thermal zone device with a thermal cooling device.
 	.unbind: unbind the thermal zone device with a thermal cooling device.
 	.get_temp: get the current temperature of the thermal zone.
+	.set_trips: set the trip points window. Whenever the current temperature
+		    is updated, the trip points immediately below and above the
+		    current temperature are found.
 	.get_mode: get the current mode (enabled/disabled) of the thermal zone.
 	    - "enabled" means the kernel thermal management is enabled.
 	    - "disabled" will prevent kernel thermal driver action upon trip points
@@ -95,6 +98,10 @@ temperature) and throttle appropriate devices.
 			get_temp:	a pointer to a function that reads the
 					sensor temperature. This is mandatory
 					callback provided by sensor driver.
+			set_trips:      a pointer to a function that sets a
+					temperature window. When this window is
+					left the driver must inform the thermal
+					core via thermal_zone_device_update.
 			get_trend: 	a pointer to a function that reads the
 					sensor temperature trend.
 			set_emul_temp:	a pointer to a function that sets
@@ -140,6 +147,18 @@ temperature) and throttle appropriate devices.
 	Normally this function will not need to be called and the resource
 	management code will ensure that the resource is freed.
 
+1.1.7 int thermal_zone_get_slope(struct thermal_zone_device *tz)
+
+	This interface is used to read the slope attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
+1.1.8 int thermal_zone_get_offset(struct thermal_zone_device *tz)
+
+	This interface is used to read the offset attribute value
+	for the thermal zone device, which might be useful for platform
+	drivers for temperature calculations.
+
 1.2 thermal cooling device interface
 1.2.1 struct thermal_cooling_device *thermal_cooling_device_register(char *name,
 		void *devdata, struct thermal_cooling_device_ops *)
diff --git a/Documentation/watchdog/watchdog-kernel-api.txt b/Documentation/watchdog/watchdog-kernel-api.txt
index 7f31125c123e..ea277478982f 100644
--- a/Documentation/watchdog/watchdog-kernel-api.txt
+++ b/Documentation/watchdog/watchdog-kernel-api.txt
@@ -48,8 +48,10 @@ struct watchdog_device {
 	const struct attribute_group **groups;
 	const struct watchdog_info *info;
 	const struct watchdog_ops *ops;
+	const struct watchdog_governor *gov;
 	unsigned int bootstatus;
 	unsigned int timeout;
+	unsigned int pretimeout;
 	unsigned int min_timeout;
 	unsigned int max_timeout;
 	unsigned int min_hw_heartbeat_ms;
@@ -74,9 +76,11 @@ It contains following fields:
 * info: a pointer to a watchdog_info structure. This structure gives some
   additional information about the watchdog timer itself. (Like it's unique name)
 * ops: a pointer to the list of watchdog operations that the watchdog supports.
+* gov: a pointer to the assigned watchdog device pretimeout governor or NULL.
 * timeout: the watchdog timer's timeout value (in seconds).
   This is the time after which the system will reboot if user space does
   not send a heartbeat request if WDOG_ACTIVE is set.
+* pretimeout: the watchdog timer's pretimeout value (in seconds).
 * min_timeout: the watchdog timer's minimum timeout value (in seconds).
   If set, the minimum configurable value for 'timeout'.
 * max_timeout: the watchdog timer's maximum timeout value (in seconds),
@@ -121,6 +125,7 @@ struct watchdog_ops {
 	int (*ping)(struct watchdog_device *);
 	unsigned int (*status)(struct watchdog_device *);
 	int (*set_timeout)(struct watchdog_device *, unsigned int);
+	int (*set_pretimeout)(struct watchdog_device *, unsigned int);
 	unsigned int (*get_timeleft)(struct watchdog_device *);
 	int (*restart)(struct watchdog_device *);
 	void (*ref)(struct watchdog_device *) __deprecated;
@@ -188,6 +193,23 @@ they are supported. These optional routines/operations are:
   If set_timeout is not provided but, WDIOF_SETTIMEOUT is set, the watchdog
   infrastructure updates the timeout value of the watchdog_device internally
   to the requested value.
+  If the pretimeout feature is used (WDIOF_PRETIMEOUT), then set_timeout must
+  also take care of checking if pretimeout is still valid and set up the timer
+  accordingly. This can't be done in the core without races, so it is the
+  duty of the driver.
+* set_pretimeout: this routine checks and changes the pretimeout value of
+  the watchdog. It is optional because not all watchdogs support pretimeout
+  notification. The timeout value is not an absolute time, but the number of
+  seconds before the actual timeout would happen. It returns 0 on success,
+  -EINVAL for "parameter out of range" and -EIO for "could not write value to
+  the watchdog". A value of 0 disables pretimeout notification.
+  (Note: the WDIOF_PRETIMEOUT needs to be set in the options field of the
+  watchdog's info structure).
+  If the watchdog driver does not have to perform any action but setting the
+  watchdog_device.pretimeout, this callback can be omitted. That means if
+  set_pretimeout is not provided but WDIOF_PRETIMEOUT is set, the watchdog
+  infrastructure updates the pretimeout value of the watchdog_device internally
+  to the requested value.
 * get_timeleft: this routines returns the time that's left before a reset.
 * restart: this routine restarts the machine. It returns 0 on success or a
   negative errno code for failure.
@@ -268,3 +290,14 @@ User should follow the following guidelines for setting the priority:
 * 128: default restart handler, use if no other handler is expected to be
   available, and/or if restart is sufficient to restart the entire system
 * 255: highest priority, will preempt all other restart handlers
+
+To raise a pretimeout notification, the following function should be used:
+
+void watchdog_notify_pretimeout(struct watchdog_device *wdd)
+
+The function can be called in the interrupt context. If watchdog pretimeout
+governor framework (kbuild CONFIG_WATCHDOG_PRETIMEOUT_GOV symbol) is enabled,
+an action is taken by a preconfigured pretimeout governor preassigned to
+the watchdog device. If watchdog pretimeout governor framework is not
+enabled, watchdog_notify_pretimeout() prints a notification message to
+the kernel log buffer.
diff --git a/MAINTAINERS b/MAINTAINERS
index 5e925a25e77d..1fc66f0aceb5 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -4775,15 +4775,6 @@ L:	iommu@lists.linux-foundation.org
 S:	Maintained
 F:	drivers/iommu/exynos-iommu.c
 
-EXYNOS MIPI DISPLAY DRIVERS
-M:	Inki Dae <inki.dae@samsung.com>
-M:	Donghwa Lee <dh09.lee@samsung.com>
-M:	Kyungmin Park <kyungmin.park@samsung.com>
-L:	linux-fbdev@vger.kernel.org
-S:	Maintained
-F:	drivers/video/fbdev/exynos/exynos_mipi*
-F:	include/video/exynos_mipi*
-
 EZchip NPS platform support
 M:	Noam Camus <noamc@ezchip.com>
 S:	Supported
@@ -4962,12 +4953,9 @@ F:	drivers/net/wan/dlci.c
 F:	drivers/net/wan/sdla.c
 
 FRAMEBUFFER LAYER
-M:	Jean-Christophe Plagniol-Villard <plagnioj@jcrosoft.com>
 M:	Tomi Valkeinen <tomi.valkeinen@ti.com>
 L:	linux-fbdev@vger.kernel.org
-W:	http://linux-fbdev.sourceforge.net/
 Q:	http://patchwork.kernel.org/project/linux-fbdev/list/
-T:	git git://git.kernel.org/pub/scm/linux/kernel/git/plagnioj/linux-fbdev.git
 S:	Maintained
 F:	Documentation/fb/
 F:	drivers/video/
@@ -9201,6 +9189,14 @@ S:	Maintained
 F:	Documentation/devicetree/bindings/pci/versatile.txt
 F:	drivers/pci/host/pci-versatile.c
 
+PCI DRIVER FOR ARMADA 8K
+M:	Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
+L:	linux-pci@vger.kernel.org
+L:	linux-arm-kernel@lists.infradead.org
+S:	Maintained
+F:	Documentation/devicetree/bindings/pci/pci-armada8k.txt
+F:	drivers/pci/host/pcie-armada8k.c
+
 PCI DRIVER FOR APPLIEDMICRO XGENE
 M:	Tanmay Inamdar <tinamdar@apm.com>
 L:	linux-pci@vger.kernel.org
@@ -9247,6 +9243,7 @@ M:	Thomas Petazzoni <thomas.petazzoni@free-electrons.com>
 L:	linux-pci@vger.kernel.org
 L:	linux-arm-kernel@lists.infradead.org (moderated for non-subscribers)
 S:	Maintained
+F:	Documentation/devicetree/bindings/pci/aardvark-pci.txt
 F:	drivers/pci/host/pci-aardvark.c
 
 PCI DRIVER FOR NVIDIA TEGRA
diff --git a/Makefile b/Makefile
index addb235b537c..27f97b53e6eb 100644
--- a/Makefile
+++ b/Makefile
@@ -621,6 +621,7 @@ include arch/$(SRCARCH)/Makefile
 
 KBUILD_CFLAGS	+= $(call cc-option,-fno-delete-null-pointer-checks,)
 KBUILD_CFLAGS	+= $(call cc-disable-warning,maybe-uninitialized,)
+KBUILD_CFLAGS	+= $(call cc-disable-warning,frame-address,)
 
 ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE
 KBUILD_CFLAGS	+= -Os
diff --git a/arch/arm/boot/dts/tegra124-jetson-tk1.dts b/arch/arm/boot/dts/tegra124-jetson-tk1.dts
index e52b82449a79..53994f9fbbcc 100644
--- a/arch/arm/boot/dts/tegra124-jetson-tk1.dts
+++ b/arch/arm/boot/dts/tegra124-jetson-tk1.dts
@@ -2045,44 +2045,32 @@
 	thermal-zones {
 		cpu {
 			trips {
-				trip {
+				cpu-shutdown-trip {
 					temperature = <101000>;
 					hysteresis = <0>;
 					type = "critical";
 				};
 			};
-
-			cooling-maps {
-				/* There are currently no cooling maps because there are no cooling devices */
-			};
 		};
 
 		mem {
 			trips {
-				trip {
+				mem-shutdown-trip {
 					temperature = <101000>;
 					hysteresis = <0>;
 					type = "critical";
 				};
 			};
-
-			cooling-maps {
-				/* There are currently no cooling maps because there are no cooling devices */
-			};
 		};
 
 		gpu {
 			trips {
-				trip {
+				gpu-shutdown-trip {
 					temperature = <101000>;
 					hysteresis = <0>;
 					type = "critical";
 				};
 			};
-
-			cooling-maps {
-				/* There are currently no cooling maps because there are no cooling devices */
-			};
 		};
 	};
 };
diff --git a/arch/arm/boot/dts/tegra124.dtsi b/arch/arm/boot/dts/tegra124.dtsi
index ea340f9de448..187a36c6d0fc 100644
--- a/arch/arm/boot/dts/tegra124.dtsi
+++ b/arch/arm/boot/dts/tegra124.dtsi
@@ -851,7 +851,9 @@
 
 	soctherm: thermal-sensor@700e2000 {
 		compatible = "nvidia,tegra124-soctherm";
-		reg = <0x0 0x700e2000 0x0 0x1000>;
+		reg = <0x0 0x700e2000 0x0 0x600 /* SOC_THERM reg_base */
+			0x0 0x60006000 0x0 0x400>; /* CAR reg_base */
+		reg-names = "soctherm-reg", "car-reg";
 		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
 			<&tegra_car TEGRA124_CLK_SOC_THERM>;
@@ -859,6 +861,15 @@
 		resets = <&tegra_car 78>;
 		reset-names = "soctherm";
 		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-percent = <85>;
+
+				#cooling-cells = <2>;
+			};
+		};
 	};
 
 	dfll: clock@70110000 {
@@ -1154,6 +1165,26 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+			trips {
+				cpu-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+				cpu_throttle_trip: throttle-trip {
+					temperature = <100000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
 		};
 
 		mem {
@@ -1162,6 +1193,21 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_MEM>;
+
+			trips {
+				mem-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
 		};
 
 		gpu {
@@ -1170,6 +1216,26 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_GPU>;
+
+			trips {
+				gpu-shutdown-trip {
+					temperature = <101000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+				gpu_throttle_trip: throttle-trip {
+					temperature = <99000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
 		};
 
 		pllx {
@@ -1178,6 +1244,21 @@
 
 			thermal-sensors =
 				<&soctherm TEGRA124_SOCTHERM_SENSOR_PLLX>;
+
+			trips {
+				pllx-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
 		};
 	};
 
diff --git a/arch/arm/configs/exynos_defconfig b/arch/arm/configs/exynos_defconfig
index 4e484f406419..c58f6841f8aa 100644
--- a/arch/arm/configs/exynos_defconfig
+++ b/arch/arm/configs/exynos_defconfig
@@ -168,8 +168,6 @@ CONFIG_DRM_PANEL_SAMSUNG_LD9040=y
 CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=y
 CONFIG_DRM_NXP_PTN3460=y
 CONFIG_DRM_PARADE_PS8622=y
-CONFIG_EXYNOS_VIDEO=y
-CONFIG_EXYNOS_MIPI_DSI=y
 CONFIG_LCD_CLASS_DEVICE=y
 CONFIG_LCD_PLATFORM=y
 CONFIG_BACKLIGHT_PWM=y
diff --git a/arch/arm/mm/fault.h b/arch/arm/mm/fault.h
index 05ec5e0df32d..67532f242271 100644
--- a/arch/arm/mm/fault.h
+++ b/arch/arm/mm/fault.h
@@ -23,7 +23,6 @@ static inline int fsr_fs(unsigned int fsr)
 #endif
 
 void do_bad_area(unsigned long addr, unsigned int fsr, struct pt_regs *regs);
-unsigned long search_exception_table(unsigned long addr);
 void early_abt_enable(void);
 
 #endif	/* __ARCH_ARM_FAULT_H */
diff --git a/arch/arm64/boot/dts/nvidia/tegra132.dtsi b/arch/arm64/boot/dts/nvidia/tegra132.dtsi
index 2013f8916084..3f3a46a4bd01 100644
--- a/arch/arm64/boot/dts/nvidia/tegra132.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra132.dtsi
@@ -4,6 +4,7 @@
 #include <dt-bindings/pinctrl/pinctrl-tegra.h>
 #include <dt-bindings/pinctrl/pinctrl-tegra-xusb.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
+#include <dt-bindings/thermal/tegra124-soctherm.h>
 
 / {
 	compatible = "nvidia,tegra132", "nvidia,tegra124";
@@ -727,8 +728,10 @@
 	};
 
 	soctherm: thermal-sensor@700e2000 {
-		compatible = "nvidia,tegra124-soctherm";
-		reg = <0x0 0x700e2000 0x0 0x1000>;
+		compatible = "nvidia,tegra132-soctherm";
+		reg = <0x0 0x700e2000 0x0 0x600 /* 0: SOC_THERM reg_base */
+			0x0 0x70040000 0x0 0x200>; /* 2: CCROC reg_base */
+		reg-names = "soctherm-reg", "ccroc-reg";
 		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
 		clocks = <&tegra_car TEGRA124_CLK_TSENSOR>,
 			<&tegra_car TEGRA124_CLK_SOC_THERM>;
@@ -736,6 +739,118 @@
 		resets = <&tegra_car 78>;
 		reset-names = "soctherm";
 		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-level = <TEGRA_SOCTHERM_THROT_LEVEL_HIGH>;
+
+				#cooling-cells = <2>;
+			};
+		};
+	};
+
+	thermal-zones {
+		cpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+			trips {
+				cpu_shutdown_trip {
+					temperature = <105000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+
+				cpu_throttle_trip: throttle-trip {
+					temperature = <102000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		mem {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_MEM>;
+
+			trips {
+				mem_shutdown_trip {
+					temperature = <101000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
+		gpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_GPU>;
+
+			trips {
+				gpu_shutdown_trip {
+					temperature = <101000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+
+				gpu_throttle_trip: throttle-trip {
+					temperature = <99000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		pllx {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_PLLX>;
+
+			trips {
+				pllx_shutdown_trip {
+					temperature = <105000>;
+					hysteresis = <1000>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
 	};
 
 	ahub@70300000 {
diff --git a/arch/arm64/boot/dts/nvidia/tegra210.dtsi b/arch/arm64/boot/dts/nvidia/tegra210.dtsi
index f6739797150a..46045fe719da 100644
--- a/arch/arm64/boot/dts/nvidia/tegra210.dtsi
+++ b/arch/arm64/boot/dts/nvidia/tegra210.dtsi
@@ -3,6 +3,7 @@
 #include <dt-bindings/memory/tegra210-mc.h>
 #include <dt-bindings/pinctrl/pinctrl-tegra.h>
 #include <dt-bindings/interrupt-controller/arm-gic.h>
+#include <dt-bindings/thermal/tegra124-soctherm.h>
 
 / {
 	compatible = "nvidia,tegra210";
@@ -1159,4 +1160,130 @@
 				(GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>;
 		interrupt-parent = <&gic>;
 	};
+
+	soctherm: thermal-sensor@700e2000 {
+		compatible = "nvidia,tegra210-soctherm";
+		reg = <0x0 0x700e2000 0x0 0x600 /* SOC_THERM reg_base */
+			0x0 0x60006000 0x0 0x400>; /* CAR reg_base */
+		reg-names = "soctherm-reg", "car-reg";
+		interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>;
+		clocks = <&tegra_car TEGRA210_CLK_TSENSOR>,
+			<&tegra_car TEGRA210_CLK_SOC_THERM>;
+		clock-names = "tsensor", "soctherm";
+		resets = <&tegra_car 78>;
+		reset-names = "soctherm";
+		#thermal-sensor-cells = <1>;
+
+		throttle-cfgs {
+			throttle_heavy: heavy {
+				nvidia,priority = <100>;
+				nvidia,cpu-throt-percent = <85>;
+
+				#cooling-cells = <2>;
+			};
+		};
+	};
+
+	thermal-zones {
+		cpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_CPU>;
+
+			trips {
+				cpu-shutdown-trip {
+					temperature = <102500>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+
+				cpu_throttle_trip: throttle-trip {
+					temperature = <98500>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&cpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		mem {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_MEM>;
+
+			trips {
+				mem-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
+		gpu {
+			polling-delay-passive = <1000>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_GPU>;
+
+			trips {
+				gpu-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+
+				gpu_throttle_trip: throttle-trip {
+					temperature = <100000>;
+					hysteresis = <1000>;
+					type = "hot";
+				};
+			};
+
+			cooling-maps {
+				map0 {
+					trip = <&gpu_throttle_trip>;
+					cooling-device = <&throttle_heavy 1 1>;
+				};
+			};
+		};
+		pllx {
+			polling-delay-passive = <0>;
+			polling-delay = <0>;
+
+			thermal-sensors =
+				<&soctherm TEGRA124_SOCTHERM_SENSOR_PLLX>;
+
+			trips {
+				pllx-shutdown-trip {
+					temperature = <103000>;
+					hysteresis = <0>;
+					type = "critical";
+				};
+			};
+
+			cooling-maps {
+				/*
+				 * There are currently no cooling maps,
+				 * because there are no cooling devices.
+				 */
+			};
+		};
+	};
 };
diff --git a/arch/frv/include/asm/pgtable.h b/arch/frv/include/asm/pgtable.h
index 07d7a7ef8bd5..a0513d463a1f 100644
--- a/arch/frv/include/asm/pgtable.h
+++ b/arch/frv/include/asm/pgtable.h
@@ -522,5 +522,6 @@ extern void __init pgtable_cache_init(void);
 #ifndef __ASSEMBLY__
 extern void __init paging_init(void);
 #endif /* !__ASSEMBLY__ */
+#define HAVE_ARCH_UNMAPPED_AREA
 
 #endif /* _ASM_PGTABLE_H */
diff --git a/arch/frv/include/asm/segment.h b/arch/frv/include/asm/segment.h
index 4377c89a57f5..2305142d4cf8 100644
--- a/arch/frv/include/asm/segment.h
+++ b/arch/frv/include/asm/segment.h
@@ -32,7 +32,6 @@ typedef struct {
 #define get_ds()		(KERNEL_DS)
 #define get_fs()		(__current_thread_info->addr_limit)
 #define segment_eq(a, b)	((a).seg == (b).seg)
-#define __kernel_ds_p()		segment_eq(get_fs(), KERNEL_DS)
 #define get_addr_limit()	(get_fs().seg)
 
 #define set_fs(_x)					\
diff --git a/arch/frv/include/asm/uaccess.h b/arch/frv/include/asm/uaccess.h
index 87d9e34c5df8..c0f4057eab60 100644
--- a/arch/frv/include/asm/uaccess.h
+++ b/arch/frv/include/asm/uaccess.h
@@ -20,8 +20,6 @@
 #include <asm/segment.h>
 #include <asm/sections.h>
 
-#define HAVE_ARCH_UNMAPPED_AREA	/* we decide where to put mmaps */
-
 #define __ptr(x) ((unsigned long __force *)(x))
 
 #define VERIFY_READ	0
diff --git a/arch/m68k/include/asm/uaccess_no.h b/arch/m68k/include/asm/uaccess_no.h
index 1bdf15263754..36deeb36503b 100644
--- a/arch/m68k/include/asm/uaccess_no.h
+++ b/arch/m68k/include/asm/uaccess_no.h
@@ -44,9 +44,6 @@ struct exception_table_entry
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-
 
 /*
  * These are the main single-value transfer routines.  They automatically
diff --git a/arch/microblaze/include/asm/uaccess.h b/arch/microblaze/include/asm/uaccess.h
index 826676778094..253a67e275ad 100644
--- a/arch/microblaze/include/asm/uaccess.h
+++ b/arch/microblaze/include/asm/uaccess.h
@@ -71,9 +71,6 @@ struct exception_table_entry {
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-
 #ifndef CONFIG_MMU
 
 /* Check against bounds of physical memory */
diff --git a/arch/mips/include/asm/extable.h b/arch/mips/include/asm/extable.h
new file mode 100644
index 000000000000..dce7a627a925
--- /dev/null
+++ b/arch/mips/include/asm/extable.h
@@ -0,0 +1,13 @@
+#ifndef _ASM_EXTABLE_H
+#define _ASM_EXTABLE_H
+
+struct exception_table_entry
+{
+	unsigned long insn;
+	unsigned long nextinsn;
+};
+
+struct pt_regs;
+extern int fixup_exception(struct pt_regs *regs);
+
+#endif
diff --git a/arch/mips/include/asm/module.h b/arch/mips/include/asm/module.h
index 0aaf9a01ea50..702c273e67a9 100644
--- a/arch/mips/include/asm/module.h
+++ b/arch/mips/include/asm/module.h
@@ -3,7 +3,7 @@
 
 #include <linux/list.h>
 #include <linux/elf.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
 
 struct mod_arch_specific {
 	/* Data Bus Error exception tables */
diff --git a/arch/mips/include/asm/uaccess.h b/arch/mips/include/asm/uaccess.h
index 21a2aaba20d5..4daf839cd8a8 100644
--- a/arch/mips/include/asm/uaccess.h
+++ b/arch/mips/include/asm/uaccess.h
@@ -16,6 +16,7 @@
 #include <linux/thread_info.h>
 #include <linux/string.h>
 #include <asm/asm-eva.h>
+#include <asm/extable.h>
 
 /*
  * The fs value determines whether argument validity checking should be
@@ -1485,12 +1486,4 @@ static inline long strnlen_user(const char __user *s, long n)
 	return res;
 }
 
-struct exception_table_entry
-{
-	unsigned long insn;
-	unsigned long nextinsn;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
-
 #endif /* _ASM_UACCESS_H */
diff --git a/arch/mips/lasat/picvue_proc.c b/arch/mips/lasat/picvue_proc.c
index 27533c109f92..dd292dcec684 100644
--- a/arch/mips/lasat/picvue_proc.c
+++ b/arch/mips/lasat/picvue_proc.c
@@ -16,6 +16,7 @@
 
 #include <linux/timer.h>
 #include <linux/mutex.h>
+#include <linux/uaccess.h>
 
 #include "picvue.h"
 
diff --git a/arch/mn10300/include/asm/processor.h b/arch/mn10300/include/asm/processor.h
index 769d5ed8e992..b10ba121c849 100644
--- a/arch/mn10300/include/asm/processor.h
+++ b/arch/mn10300/include/asm/processor.h
@@ -18,7 +18,6 @@
 #include <asm/page.h>
 #include <asm/ptrace.h>
 #include <asm/cpu-regs.h>
-#include <asm/uaccess.h>
 #include <asm/current.h>
 
 /* Forward declaration, a strange C thing */
diff --git a/arch/mn10300/include/asm/uaccess.h b/arch/mn10300/include/asm/uaccess.h
index d012e877a95a..2eedf6f46a57 100644
--- a/arch/mn10300/include/asm/uaccess.h
+++ b/arch/mn10300/include/asm/uaccess.h
@@ -38,7 +38,6 @@
 #define get_ds()	(KERNEL_DS)
 #define get_fs()	(current_thread_info()->addr_limit)
 #define set_fs(x)	(current_thread_info()->addr_limit = (x))
-#define __kernel_ds_p() (current_thread_info()->addr_limit.seg == 0x9FFFFFFF)
 
 #define segment_eq(a, b) ((a).seg == (b).seg)
 
@@ -72,12 +71,6 @@ static inline int ___range_ok(unsigned long addr, unsigned int size)
 #define access_ok(type, addr, size) (__range_ok((addr), (size)) == 0)
 #define __access_ok(addr, size)     (__range_ok((addr), (size)) == 0)
 
-static inline int verify_area(int type, const void *addr, unsigned long size)
-{
-	return access_ok(type, addr, size) ? 0 : -EFAULT;
-}
-
-
 /*
  * The exception table consists of pairs of addresses: the first is the
  * address of an instruction that is allowed to fault, and the second is
diff --git a/arch/mn10300/kernel/signal.c b/arch/mn10300/kernel/signal.c
index dfd0301cf200..cd8cb1d1176b 100644
--- a/arch/mn10300/kernel/signal.c
+++ b/arch/mn10300/kernel/signal.c
@@ -75,7 +75,7 @@ static int restore_sigcontext(struct pt_regs *regs,
 		struct fpucontext *buf;
 		err |= __get_user(buf, &sc->fpucontext);
 		if (buf) {
-			if (verify_area(VERIFY_READ, buf, sizeof(*buf)))
+			if (!access_ok(VERIFY_READ, buf, sizeof(*buf)))
 				goto badframe;
 			err |= fpu_restore_sigcontext(buf);
 		}
@@ -98,7 +98,7 @@ asmlinkage long sys_sigreturn(void)
 	long d0;
 
 	frame = (struct sigframe __user *) current_frame()->sp;
-	if (verify_area(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
 	if (__get_user(set.sig[0], &frame->sc.oldmask))
 		goto badframe;
@@ -130,7 +130,7 @@ asmlinkage long sys_rt_sigreturn(void)
 	long d0;
 
 	frame = (struct rt_sigframe __user *) current_frame()->sp;
-	if (verify_area(VERIFY_READ, frame, sizeof(*frame)))
+	if (!access_ok(VERIFY_READ, frame, sizeof(*frame)))
 		goto badframe;
 	if (__copy_from_user(&set, &frame->uc.uc_sigmask, sizeof(set)))
 		goto badframe;
diff --git a/arch/openrisc/include/asm/uaccess.h b/arch/openrisc/include/asm/uaccess.h
index 5cc6b4f1b795..140faa16685a 100644
--- a/arch/openrisc/include/asm/uaccess.h
+++ b/arch/openrisc/include/asm/uaccess.h
@@ -82,10 +82,6 @@ struct exception_table_entry {
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-extern void sort_exception_table(void);
-
 /*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
diff --git a/arch/parisc/include/asm/pgtable.h b/arch/parisc/include/asm/pgtable.h
index e44bdb9078a5..c2c43f714684 100644
--- a/arch/parisc/include/asm/pgtable.h
+++ b/arch/parisc/include/asm/pgtable.h
@@ -83,10 +83,10 @@ static inline void purge_tlb_entries(struct mm_struct *mm, unsigned long addr)
 	printk("%s:%d: bad pgd %08lx.\n", __FILE__, __LINE__, (unsigned long)pgd_val(e))
 
 /* This is the size of the initially mapped kernel memory */
-#if defined(CONFIG_64BIT) || defined(CONFIG_SMP)
-#define KERNEL_INITIAL_ORDER	25	/* 1<<25 = 32MB */
+#if defined(CONFIG_64BIT)
+#define KERNEL_INITIAL_ORDER	26	/* 1<<26 = 64MB */
 #else
-#define KERNEL_INITIAL_ORDER	24	/* 1<<24 = 16MB */
+#define KERNEL_INITIAL_ORDER	25	/* 1<<25 = 32MB */
 #endif
 #define KERNEL_INITIAL_SIZE	(1 << KERNEL_INITIAL_ORDER)
 
diff --git a/arch/parisc/include/asm/traps.h b/arch/parisc/include/asm/traps.h
index 5e953ab4530d..63670231f48a 100644
--- a/arch/parisc/include/asm/traps.h
+++ b/arch/parisc/include/asm/traps.h
@@ -11,6 +11,7 @@ void parisc_terminate(char *msg, struct pt_regs *regs,
 void die_if_kernel(char *str, struct pt_regs *regs, long err);
 
 /* mm/fault.c */
+const char *trap_name(unsigned long code);
 void do_page_fault(struct pt_regs *regs, unsigned long code,
 		unsigned long address);
 #endif
diff --git a/arch/parisc/kernel/traps.c b/arch/parisc/kernel/traps.c
index 97d6b208e129..378df9207406 100644
--- a/arch/parisc/kernel/traps.c
+++ b/arch/parisc/kernel/traps.c
@@ -458,8 +458,8 @@ void parisc_terminate(char *msg, struct pt_regs *regs, int code, unsigned long o
 	}
 
 	printk("\n");
-	printk(KERN_CRIT "%s: Code=%d regs=%p (Addr=" RFMT ")\n",
-			msg, code, regs, offset);
+	pr_crit("%s: Code=%d (%s) regs=%p (Addr=" RFMT ")\n",
+		msg, code, trap_name(code), regs, offset);
 	show_regs(regs);
 
 	spin_unlock(&terminate_lock);
diff --git a/arch/parisc/kernel/vmlinux.lds.S b/arch/parisc/kernel/vmlinux.lds.S
index b37787dbe775..3d6ef1b29c6a 100644
--- a/arch/parisc/kernel/vmlinux.lds.S
+++ b/arch/parisc/kernel/vmlinux.lds.S
@@ -90,8 +90,9 @@ SECTIONS
 	/* Start of data section */
 	_sdata = .;
 
-	RO_DATA_SECTION(8)
-
+	/* Architecturally we need to keep __gp below 0x1000000 and thus
+	 * in front of RO_DATA_SECTION() which stores lots of tracepoint
+	 * and ftrace symbols. */
 #ifdef CONFIG_64BIT
 	. = ALIGN(16);
 	/* Linkage tables */
@@ -106,6 +107,12 @@ SECTIONS
 	}
 #endif
 
+	RO_DATA_SECTION(8)
+
+	/* RO because of BUILDTIME_EXTABLE_SORT */
+	EXCEPTION_TABLE(8)
+	NOTES
+
 	/* unwind info */
 	.PARISC.unwind : {
 		__start___unwind = .;
@@ -121,9 +128,6 @@ SECTIONS
 	. = ALIGN(HUGEPAGE_SIZE);
 	data_start = .;
 
-	EXCEPTION_TABLE(8)
-	NOTES
-
 	/* Data */
 	RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, PAGE_SIZE)
 
diff --git a/arch/parisc/mm/fault.c b/arch/parisc/mm/fault.c
index 47a6ca4c9e40..8ff9253930af 100644
--- a/arch/parisc/mm/fault.c
+++ b/arch/parisc/mm/fault.c
@@ -14,7 +14,7 @@
 #include <linux/ptrace.h>
 #include <linux/sched.h>
 #include <linux/interrupt.h>
-#include <linux/module.h>
+#include <linux/extable.h>
 #include <linux/uaccess.h>
 
 #include <asm/traps.h>
@@ -204,6 +204,16 @@ static const char * const trap_description[] = {
 	[28] "Unaligned data reference trap",
 };
 
+const char *trap_name(unsigned long code)
+{
+	const char *t = NULL;
+
+	if (code < ARRAY_SIZE(trap_description))
+		t = trap_description[code];
+
+	return t ? t : "Unknown trap";
+}
+
 /*
  * Print out info about fatal segfaults, if the show_unhandled_signals
  * sysctl is set:
@@ -213,8 +223,6 @@ show_signal_msg(struct pt_regs *regs, unsigned long code,
 		unsigned long address, struct task_struct *tsk,
 		struct vm_area_struct *vma)
 {
-	const char *trap_name = NULL;
-
 	if (!unhandled_signal(tsk, SIGSEGV))
 		return;
 
@@ -226,10 +234,7 @@ show_signal_msg(struct pt_regs *regs, unsigned long code,
 	    tsk->comm, code, address);
 	print_vma_addr(KERN_CONT " in ", regs->iaoq[0]);
 
-	if (code < ARRAY_SIZE(trap_description))
-		trap_name = trap_description[code];
-	pr_warn(KERN_CONT " trap #%lu: %s%c", code,
-		trap_name ? trap_name : "unknown",
+	pr_cont(" trap #%lu: %s%c", code, trap_name(code),
 		vma ? ',':'\n');
 
 	if (vma)
diff --git a/arch/parisc/mm/init.c b/arch/parisc/mm/init.c
index 356f38473b5d..e02ada312be8 100644
--- a/arch/parisc/mm/init.c
+++ b/arch/parisc/mm/init.c
@@ -105,6 +105,8 @@ static void * __init get_memblock(unsigned long size)
 	else
 		panic("get_memblock() failed.\n");
 
+	memset(__va(phys), 0, size);
+
 	return __va(phys);
 }
 
diff --git a/arch/score/include/asm/extable.h b/arch/score/include/asm/extable.h
new file mode 100644
index 000000000000..c4423ccf830d
--- /dev/null
+++ b/arch/score/include/asm/extable.h
@@ -0,0 +1,11 @@
+#ifndef _ASM_SCORE_EXTABLE_H
+#define _ASM_SCORE_EXTABLE_H
+
+struct exception_table_entry {
+	unsigned long insn;
+	unsigned long fixup;
+};
+
+struct pt_regs;
+extern int fixup_exception(struct pt_regs *regs);
+#endif
diff --git a/arch/score/include/asm/module.h b/arch/score/include/asm/module.h
index abf395bbfaba..6dc1f2935eef 100644
--- a/arch/score/include/asm/module.h
+++ b/arch/score/include/asm/module.h
@@ -2,7 +2,7 @@
 #define _ASM_SCORE_MODULE_H
 
 #include <linux/list.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
 #include <asm-generic/module.h>
 
 struct mod_arch_specific {
diff --git a/arch/score/include/asm/uaccess.h b/arch/score/include/asm/uaccess.h
index 01aec8ccde83..db58ab98ec4b 100644
--- a/arch/score/include/asm/uaccess.h
+++ b/arch/score/include/asm/uaccess.h
@@ -4,6 +4,7 @@
 #include <linux/kernel.h>
 #include <linux/errno.h>
 #include <linux/thread_info.h>
+#include <asm/extable.h>
 
 #define VERIFY_READ		0
 #define VERIFY_WRITE		1
@@ -420,12 +421,5 @@ static inline long strnlen_user(const char __user *str, long len)
 		return __strnlen_user(str, len);
 }
 
-struct exception_table_entry {
-	unsigned long insn;
-	unsigned long fixup;
-};
-
-extern int fixup_exception(struct pt_regs *regs);
-
 #endif /* __SCORE_UACCESS_H */
 
diff --git a/arch/sh/include/asm/uaccess.h b/arch/sh/include/asm/uaccess.h
index 92ade79ac427..a38d0c7b818f 100644
--- a/arch/sh/include/asm/uaccess.h
+++ b/arch/sh/include/asm/uaccess.h
@@ -192,8 +192,6 @@ struct exception_table_entry {
 #endif
 
 int fixup_exception(struct pt_regs *regs);
-/* Returns 0 if exception not found and fixup.unit otherwise.  */
-unsigned long search_exception_table(unsigned long addr);
 const struct exception_table_entry *search_exception_tables(unsigned long addr);
 
 extern void *set_exception_table_vec(unsigned int vec, void *handler);
diff --git a/arch/sparc/include/asm/elf_64.h b/arch/sparc/include/asm/elf_64.h
index 93310837c2df..3f2d403873bd 100644
--- a/arch/sparc/include/asm/elf_64.h
+++ b/arch/sparc/include/asm/elf_64.h
@@ -7,7 +7,7 @@
 
 #include <asm/ptrace.h>
 #include <asm/processor.h>
-#include <asm/uaccess.h>
+#include <asm/extable_64.h>
 #include <asm/spitfire.h>
 
 /*
diff --git a/arch/sparc/include/asm/extable_64.h b/arch/sparc/include/asm/extable_64.h
new file mode 100644
index 000000000000..1121cb056ffb
--- /dev/null
+++ b/arch/sparc/include/asm/extable_64.h
@@ -0,0 +1,20 @@
+#ifndef __ASM_EXTABLE64_H
+#define __ASM_EXTABLE64_H
+/*
+ * The exception table consists of pairs of addresses: the first is the
+ * address of an instruction that is allowed to fault, and the second is
+ * the address at which the program should continue.  No registers are
+ * modified, so it is entirely up to the continuation code to figure out
+ * what to do.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+        unsigned int insn, fixup;
+};
+
+#endif
diff --git a/arch/sparc/include/asm/uaccess_64.h b/arch/sparc/include/asm/uaccess_64.h
index 37a315d0ddd4..b68acc563235 100644
--- a/arch/sparc/include/asm/uaccess_64.h
+++ b/arch/sparc/include/asm/uaccess_64.h
@@ -13,6 +13,7 @@
 #include <asm/asi.h>
 #include <asm/spitfire.h>
 #include <asm-generic/uaccess-unaligned.h>
+#include <asm/extable_64.h>
 #endif
 
 #ifndef __ASSEMBLY__
@@ -81,23 +82,6 @@ static inline int access_ok(int type, const void __user * addr, unsigned long si
 	return 1;
 }
 
-/*
- * The exception table consists of pairs of addresses: the first is the
- * address of an instruction that is allowed to fault, and the second is
- * the address at which the program should continue.  No registers are
- * modified, so it is entirely up to the continuation code to figure out
- * what to do.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-        unsigned int insn, fixup;
-};
-
 void __ret_efault(void);
 void __retl_efault(void);
 
diff --git a/arch/x86/include/asm/cacheflush.h b/arch/x86/include/asm/cacheflush.h
index 61518cf79437..872877d930de 100644
--- a/arch/x86/include/asm/cacheflush.h
+++ b/arch/x86/include/asm/cacheflush.h
@@ -4,7 +4,6 @@
 /* Caches aren't brain-dead on the intel. */
 #include <asm-generic/cacheflush.h>
 #include <asm/special_insns.h>
-#include <asm/uaccess.h>
 
 /*
  * The set_memory_* API can be used to change various attributes of a virtual
diff --git a/arch/x86/include/asm/extable.h b/arch/x86/include/asm/extable.h
new file mode 100644
index 000000000000..b8ad261d11dc
--- /dev/null
+++ b/arch/x86/include/asm/extable.h
@@ -0,0 +1,35 @@
+#ifndef _ASM_X86_EXTABLE_H
+#define _ASM_X86_EXTABLE_H
+/*
+ * The exception table consists of triples of addresses relative to the
+ * exception table entry itself. The first address is of an instruction
+ * that is allowed to fault, the second is the target at which the program
+ * should continue. The third is a handler function to deal with the fault
+ * caused by the instruction in the first field.
+ *
+ * All the routines below use bits of fixup code that are out of line
+ * with the main instruction path.  This means when everything is well,
+ * we don't even have to jump over them.  Further, they do not intrude
+ * on our cache or tlb entries.
+ */
+
+struct exception_table_entry {
+	int insn, fixup, handler;
+};
+struct pt_regs;
+
+#define ARCH_HAS_RELATIVE_EXTABLE
+
+#define swap_ex_entry_fixup(a, b, tmp, delta)			\
+	do {							\
+		(a)->fixup = (b)->fixup + (delta);		\
+		(b)->fixup = (tmp).fixup - (delta);		\
+		(a)->handler = (b)->handler + (delta);		\
+		(b)->handler = (tmp).handler - (delta);		\
+	} while (0)
+
+extern int fixup_exception(struct pt_regs *regs, int trapnr);
+extern bool ex_has_fault_handler(unsigned long ip);
+extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
+
+#endif
diff --git a/arch/x86/include/asm/sections.h b/arch/x86/include/asm/sections.h
index 13b6cdd0af57..2f75f30cb2f6 100644
--- a/arch/x86/include/asm/sections.h
+++ b/arch/x86/include/asm/sections.h
@@ -2,7 +2,7 @@
 #define _ASM_X86_SECTIONS_H
 
 #include <asm-generic/sections.h>
-#include <asm/uaccess.h>
+#include <asm/extable.h>
 
 extern char __brk_base[], __brk_limit[];
 extern struct exception_table_entry __stop___ex_table[];
diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 2131c4ce7d8a..faf3687f1035 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -11,6 +11,7 @@
 #include <asm/asm.h>
 #include <asm/page.h>
 #include <asm/smap.h>
+#include <asm/extable.h>
 
 #define VERIFY_READ 0
 #define VERIFY_WRITE 1
@@ -91,37 +92,6 @@ static inline bool __chk_range_not_ok(unsigned long addr, unsigned long size, un
 	likely(!__range_not_ok(addr, size, user_addr_max()))
 
 /*
- * The exception table consists of triples of addresses relative to the
- * exception table entry itself. The first address is of an instruction
- * that is allowed to fault, the second is the target at which the program
- * should continue. The third is a handler function to deal with the fault
- * caused by the instruction in the first field.
- *
- * All the routines below use bits of fixup code that are out of line
- * with the main instruction path.  This means when everything is well,
- * we don't even have to jump over them.  Further, they do not intrude
- * on our cache or tlb entries.
- */
-
-struct exception_table_entry {
-	int insn, fixup, handler;
-};
-
-#define ARCH_HAS_RELATIVE_EXTABLE
-
-#define swap_ex_entry_fixup(a, b, tmp, delta)			\
-	do {							\
-		(a)->fixup = (b)->fixup + (delta);		\
-		(b)->fixup = (tmp).fixup - (delta);		\
-		(a)->handler = (b)->handler + (delta);		\
-		(b)->handler = (tmp).handler - (delta);		\
-	} while (0)
-
-extern int fixup_exception(struct pt_regs *regs, int trapnr);
-extern bool ex_has_fault_handler(unsigned long ip);
-extern void early_fixup_exception(struct pt_regs *regs, int trapnr);
-
-/*
  * These are the main single-value transfer routines.  They automatically
  * use the right size if we just have the right pointer type.
  *
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index 4dc13340653e..9f72ca3b2669 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -5,7 +5,7 @@
  */
 #include <linux/sched.h>		/* test_thread_flag(), ...	*/
 #include <linux/kdebug.h>		/* oops_begin/end, ...		*/
-#include <linux/extable.h>		/* search_exception_table	*/
+#include <linux/extable.h>		/* search_exception_tables	*/
 #include <linux/bootmem.h>		/* max_low_pfn			*/
 #include <linux/kprobes.h>		/* NOKPROBE_SYMBOL, ...		*/
 #include <linux/mmiotrace.h>		/* kmmio_handler, ...		*/
diff --git a/arch/xtensa/include/asm/asm-uaccess.h b/arch/xtensa/include/asm/asm-uaccess.h
new file mode 100644
index 000000000000..a7a110039786
--- /dev/null
+++ b/arch/xtensa/include/asm/asm-uaccess.h
@@ -0,0 +1,160 @@
+/*
+ * include/asm-xtensa/uaccess.h
+ *
+ * User space memory access functions
+ *
+ * These routines provide basic accessing functions to the user memory
+ * space for the kernel. This header file provides functions such as:
+ *
+ * This file is subject to the terms and conditions of the GNU General Public
+ * License.  See the file "COPYING" in the main directory of this archive
+ * for more details.
+ *
+ * Copyright (C) 2001 - 2005 Tensilica Inc.
+ */
+
+#ifndef _XTENSA_ASM_UACCESS_H
+#define _XTENSA_ASM_UACCESS_H
+
+#include <linux/errno.h>
+#include <asm/types.h>
+
+#define VERIFY_READ    0
+#define VERIFY_WRITE   1
+
+#include <asm/current.h>
+#include <asm/asm-offsets.h>
+#include <asm/processor.h>
+
+/*
+ * These assembly macros mirror the C macros in asm/uaccess.h.  They
+ * should always have identical functionality.  See
+ * arch/xtensa/kernel/sys.S for usage.
+ */
+
+#define KERNEL_DS	0
+#define USER_DS		1
+
+#define get_ds		(KERNEL_DS)
+
+/*
+ * get_fs reads current->thread.current_ds into a register.
+ * On Entry:
+ * 	<ad>	anything
+ * 	<sp>	stack
+ * On Exit:
+ * 	<ad>	contains current->thread.current_ds
+ */
+	.macro	get_fs	ad, sp
+	GET_CURRENT(\ad,\sp)
+#if THREAD_CURRENT_DS > 1020
+	addi	\ad, \ad, TASK_THREAD
+	l32i	\ad, \ad, THREAD_CURRENT_DS - TASK_THREAD
+#else
+	l32i	\ad, \ad, THREAD_CURRENT_DS
+#endif
+	.endm
+
+/*
+ * set_fs sets current->thread.current_ds to some value.
+ * On Entry:
+ *	<at>	anything (temp register)
+ *	<av>	value to write
+ *	<sp>	stack
+ * On Exit:
+ *	<at>	destroyed (actually, current)
+ *	<av>	preserved, value to write
+ */
+	.macro	set_fs	at, av, sp
+	GET_CURRENT(\at,\sp)
+	s32i	\av, \at, THREAD_CURRENT_DS
+	.endm
+
+/*
+ * kernel_ok determines whether we should bypass addr/size checking.
+ * See the equivalent C-macro version below for clarity.
+ * On success, kernel_ok branches to a label indicated by parameter
+ * <success>.  This implies that the macro falls through to the next
+ * insruction on an error.
+ *
+ * Note that while this macro can be used independently, we designed
+ * in for optimal use in the access_ok macro below (i.e., we fall
+ * through on error).
+ *
+ * On Entry:
+ * 	<at>		anything (temp register)
+ * 	<success>	label to branch to on success; implies
+ * 			fall-through macro on error
+ * 	<sp>		stack pointer
+ * On Exit:
+ * 	<at>		destroyed (actually, current->thread.current_ds)
+ */
+
+#if ((KERNEL_DS != 0) || (USER_DS == 0))
+# error Assembly macro kernel_ok fails
+#endif
+	.macro	kernel_ok  at, sp, success
+	get_fs	\at, \sp
+	beqz	\at, \success
+	.endm
+
+/*
+ * user_ok determines whether the access to user-space memory is allowed.
+ * See the equivalent C-macro version below for clarity.
+ *
+ * On error, user_ok branches to a label indicated by parameter
+ * <error>.  This implies that the macro falls through to the next
+ * instruction on success.
+ *
+ * Note that while this macro can be used independently, we designed
+ * in for optimal use in the access_ok macro below (i.e., we fall
+ * through on success).
+ *
+ * On Entry:
+ * 	<aa>	register containing memory address
+ * 	<as>	register containing memory size
+ * 	<at>	temp register
+ * 	<error>	label to branch to on error; implies fall-through
+ * 		macro on success
+ * On Exit:
+ * 	<aa>	preserved
+ * 	<as>	preserved
+ * 	<at>	destroyed (actually, (TASK_SIZE + 1 - size))
+ */
+	.macro	user_ok	aa, as, at, error
+	movi	\at, __XTENSA_UL_CONST(TASK_SIZE)
+	bgeu	\as, \at, \error
+	sub	\at, \at, \as
+	bgeu	\aa, \at, \error
+	.endm
+
+/*
+ * access_ok determines whether a memory access is allowed.  See the
+ * equivalent C-macro version below for clarity.
+ *
+ * On error, access_ok branches to a label indicated by parameter
+ * <error>.  This implies that the macro falls through to the next
+ * instruction on success.
+ *
+ * Note that we assume success is the common case, and we optimize the
+ * branch fall-through case on success.
+ *
+ * On Entry:
+ * 	<aa>	register containing memory address
+ * 	<as>	register containing memory size
+ * 	<at>	temp register
+ * 	<sp>
+ * 	<error>	label to branch to on error; implies fall-through
+ * 		macro on success
+ * On Exit:
+ * 	<aa>	preserved
+ * 	<as>	preserved
+ * 	<at>	destroyed
+ */
+	.macro	access_ok  aa, as, at, sp, error
+	kernel_ok  \at, \sp, .Laccess_ok_\@
+	user_ok    \aa, \as, \at, \error
+.Laccess_ok_\@:
+	.endm
+
+#endif	/* _XTENSA_ASM_UACCESS_H */
diff --git a/arch/xtensa/include/asm/uaccess.h b/arch/xtensa/include/asm/uaccess.h
index 147b26ed9c91..848a3d736bcb 100644
--- a/arch/xtensa/include/asm/uaccess.h
+++ b/arch/xtensa/include/asm/uaccess.h
@@ -17,153 +17,12 @@
 #define _XTENSA_UACCESS_H
 
 #include <linux/errno.h>
-#ifndef __ASSEMBLY__
 #include <linux/prefetch.h>
-#endif
 #include <asm/types.h>
 
 #define VERIFY_READ    0
 #define VERIFY_WRITE   1
 
-#ifdef __ASSEMBLY__
-
-#include <asm/current.h>
-#include <asm/asm-offsets.h>
-#include <asm/processor.h>
-
-/*
- * These assembly macros mirror the C macros that follow below.  They
- * should always have identical functionality.  See
- * arch/xtensa/kernel/sys.S for usage.
- */
-
-#define KERNEL_DS	0
-#define USER_DS		1
-
-#define get_ds		(KERNEL_DS)
-
-/*
- * get_fs reads current->thread.current_ds into a register.
- * On Entry:
- * 	<ad>	anything
- * 	<sp>	stack
- * On Exit:
- * 	<ad>	contains current->thread.current_ds
- */
-	.macro	get_fs	ad, sp
-	GET_CURRENT(\ad,\sp)
-#if THREAD_CURRENT_DS > 1020
-	addi	\ad, \ad, TASK_THREAD
-	l32i	\ad, \ad, THREAD_CURRENT_DS - TASK_THREAD
-#else
-	l32i	\ad, \ad, THREAD_CURRENT_DS
-#endif
-	.endm
-
-/*
- * set_fs sets current->thread.current_ds to some value.
- * On Entry:
- *	<at>	anything (temp register)
- *	<av>	value to write
- *	<sp>	stack
- * On Exit:
- *	<at>	destroyed (actually, current)
- *	<av>	preserved, value to write
- */
-	.macro	set_fs	at, av, sp
-	GET_CURRENT(\at,\sp)
-	s32i	\av, \at, THREAD_CURRENT_DS
-	.endm
-
-/*
- * kernel_ok determines whether we should bypass addr/size checking.
- * See the equivalent C-macro version below for clarity.
- * On success, kernel_ok branches to a label indicated by parameter
- * <success>.  This implies that the macro falls through to the next
- * insruction on an error.
- *
- * Note that while this macro can be used independently, we designed
- * in for optimal use in the access_ok macro below (i.e., we fall
- * through on error).
- *
- * On Entry:
- * 	<at>		anything (temp register)
- * 	<success>	label to branch to on success; implies
- * 			fall-through macro on error
- * 	<sp>		stack pointer
- * On Exit:
- * 	<at>		destroyed (actually, current->thread.current_ds)
- */
-
-#if ((KERNEL_DS != 0) || (USER_DS == 0))
-# error Assembly macro kernel_ok fails
-#endif
-	.macro	kernel_ok  at, sp, success
-	get_fs	\at, \sp
-	beqz	\at, \success
-	.endm
-
-/*
- * user_ok determines whether the access to user-space memory is allowed.
- * See the equivalent C-macro version below for clarity.
- *
- * On error, user_ok branches to a label indicated by parameter
- * <error>.  This implies that the macro falls through to the next
- * instruction on success.
- *
- * Note that while this macro can be used independently, we designed
- * in for optimal use in the access_ok macro below (i.e., we fall
- * through on success).
- *
- * On Entry:
- * 	<aa>	register containing memory address
- * 	<as>	register containing memory size
- * 	<at>	temp register
- * 	<error>	label to branch to on error; implies fall-through
- * 		macro on success
- * On Exit:
- * 	<aa>	preserved
- * 	<as>	preserved
- * 	<at>	destroyed (actually, (TASK_SIZE + 1 - size))
- */
-	.macro	user_ok	aa, as, at, error
-	movi	\at, __XTENSA_UL_CONST(TASK_SIZE)
-	bgeu	\as, \at, \error
-	sub	\at, \at, \as
-	bgeu	\aa, \at, \error
-	.endm
-
-/*
- * access_ok determines whether a memory access is allowed.  See the
- * equivalent C-macro version below for clarity.
- *
- * On error, access_ok branches to a label indicated by parameter
- * <error>.  This implies that the macro falls through to the next
- * instruction on success.
- *
- * Note that we assume success is the common case, and we optimize the
- * branch fall-through case on success.
- *
- * On Entry:
- * 	<aa>	register containing memory address
- * 	<as>	register containing memory size
- * 	<at>	temp register
- * 	<sp>
- * 	<error>	label to branch to on error; implies fall-through
- * 		macro on success
- * On Exit:
- * 	<aa>	preserved
- * 	<as>	preserved
- * 	<at>	destroyed
- */
-	.macro	access_ok  aa, as, at, sp, error
-	kernel_ok  \at, \sp, .Laccess_ok_\@
-	user_ok    \aa, \as, \at, \error
-.Laccess_ok_\@:
-	.endm
-
-#else /* __ASSEMBLY__ not defined */
-
 #include <linux/sched.h>
 
 /*
@@ -495,16 +354,4 @@ struct exception_table_entry
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup.unit otherwise.  */
-
-extern unsigned long search_exception_table(unsigned long addr);
-extern void sort_exception_table(void);
-
-/* Returns the new pc */
-#define fixup_exception(map_reg, fixup_unit, pc)                \
-({                                                              \
-	fixup_unit;                                             \
-})
-
-#endif	/* __ASSEMBLY__ */
 #endif	/* _XTENSA_UACCESS_H */
diff --git a/arch/xtensa/kernel/coprocessor.S b/arch/xtensa/kernel/coprocessor.S
index a482df5df2b2..6911e384f608 100644
--- a/arch/xtensa/kernel/coprocessor.S
+++ b/arch/xtensa/kernel/coprocessor.S
@@ -17,7 +17,7 @@
 #include <asm/processor.h>
 #include <asm/coprocessor.h>
 #include <asm/thread_info.h>
-#include <asm/uaccess.h>
+#include <asm/asm-uaccess.h>
 #include <asm/unistd.h>
 #include <asm/ptrace.h>
 #include <asm/current.h>
diff --git a/arch/xtensa/kernel/entry.S b/arch/xtensa/kernel/entry.S
index fa04d9d368a7..f5ef3cc0497c 100644
--- a/arch/xtensa/kernel/entry.S
+++ b/arch/xtensa/kernel/entry.S
@@ -17,7 +17,7 @@
 #include <asm/processor.h>
 #include <asm/coprocessor.h>
 #include <asm/thread_info.h>
-#include <asm/uaccess.h>
+#include <asm/asm-uaccess.h>
 #include <asm/unistd.h>
 #include <asm/ptrace.h>
 #include <asm/current.h>
diff --git a/drivers/acpi/thermal.c b/drivers/acpi/thermal.c
index f4ebe39539af..35e8fbca10ad 100644
--- a/drivers/acpi/thermal.c
+++ b/drivers/acpi/thermal.c
@@ -520,7 +520,8 @@ static void acpi_thermal_check(void *data)
 	if (!tz->tz_enabled)
 		return;
 
-	thermal_zone_device_update(tz->thermal_zone);
+	thermal_zone_device_update(tz->thermal_zone,
+				   THERMAL_EVENT_UNSPECIFIED);
 }
 
 /* sys I/F for generic thermal sysfs support */
diff --git a/drivers/char/tb0219.c b/drivers/char/tb0219.c
index 480a777db577..7c19d9b22785 100644
--- a/drivers/char/tb0219.c
+++ b/drivers/char/tb0219.c
@@ -21,6 +21,7 @@
 #include <linux/fs.h>
 #include <linux/init.h>
 #include <linux/module.h>
+#include <linux/uaccess.h>
 
 #include <asm/io.h>
 #include <asm/reboot.h>
diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3f31ca32f52b..5fa36ebc0640 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -471,9 +471,9 @@ static int bond_check_dev_link(struct bonding *bond,
 		/* Yes, the mii is overlaid on the ifreq.ifr_ifru */
 		strncpy(ifr.ifr_name, slave_dev->name, IFNAMSIZ);
 		mii = if_mii(&ifr);
-		if (IOCTL(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
+		if (ioctl(slave_dev, &ifr, SIOCGMIIPHY) == 0) {
 			mii->reg_num = MII_BMSR;
-			if (IOCTL(slave_dev, &ifr, SIOCGMIIREG) == 0)
+			if (ioctl(slave_dev, &ifr, SIOCGMIIREG) == 0)
 				return mii->val_out & BMSR_LSTATUS;
 		}
 	}
diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
index bddb198c0b74..380a64115a98 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.c
@@ -693,7 +693,7 @@ static int cn23xx_enable_io_queues(struct octeon_device *oct)
 				while ((reg_val & CN23XX_PKT_INPUT_CTL_RST) &&
 				       !(reg_val &
 					 CN23XX_PKT_INPUT_CTL_QUIET) &&
-				       loop--) {
+				       --loop) {
 					reg_val = octeon_read_csr64(
 					    oct,
 					    CN23XX_SLI_IQ_PKT_CONTROL64(q_no));
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
index d4585154151d..cc4fd61914d3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/pagealloc.c
@@ -287,7 +287,7 @@ retry:
 
 			goto retry;
 		}
-		MLX5_SET64(manage_pages_in, in, pas[i], addr);
+		MLX5_ARRAY_SET64(manage_pages_in, in, pas, i, addr);
 	}
 
 	MLX5_SET(manage_pages_in, in, opcode, MLX5_CMD_OP_MANAGE_PAGES);
@@ -344,7 +344,7 @@ static int reclaim_pages_cmd(struct mlx5_core_dev *dev,
 		if (fwp->func_id != func_id)
 			continue;
 
-		MLX5_SET64(manage_pages_out, out, pas[i], fwp->addr);
+		MLX5_ARRAY_SET64(manage_pages_out, out, pas, i, fwp->addr);
 		i++;
 	}
 
diff --git a/drivers/net/ethernet/qlogic/qed/qed_ll2.c b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
index a6db10717d5c..02a8be2faed7 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_ll2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_ll2.c
@@ -1517,7 +1517,7 @@ static void qed_ll2_register_cb_ops(struct qed_dev *cdev,
 static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 {
 	struct qed_ll2_info ll2_info;
-	struct qed_ll2_buffer *buffer;
+	struct qed_ll2_buffer *buffer, *tmp_buffer;
 	enum qed_ll2_conn_type conn_type;
 	struct qed_ptt *p_ptt;
 	int rc, i;
@@ -1587,7 +1587,7 @@ static int qed_ll2_start(struct qed_dev *cdev, struct qed_ll2_params *params)
 
 	/* Post all Rx buffers to FW */
 	spin_lock_bh(&cdev->ll2->lock);
-	list_for_each_entry(buffer, &cdev->ll2->list, list) {
+	list_for_each_entry_safe(buffer, tmp_buffer, &cdev->ll2->list, list) {
 		rc = qed_ll2_post_rx_buffer(QED_LEADING_HWFN(cdev),
 					    cdev->ll2->handle,
 					    buffer->phys_addr, 0, buffer, 1);
diff --git a/drivers/net/ethernet/qlogic/qed/qed_roce.c b/drivers/net/ethernet/qlogic/qed/qed_roce.c
index 23430059471c..76831a398bed 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_roce.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_roce.c
@@ -2947,7 +2947,7 @@ static const struct qed_rdma_ops qed_rdma_ops_pass = {
 	.roce_ll2_stats = &qed_roce_ll2_stats,
 };
 
-const struct qed_rdma_ops *qed_get_rdma_ops()
+const struct qed_rdma_ops *qed_get_rdma_ops(void)
 {
 	return &qed_rdma_ops_pass;
 }
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 1a06c87e3935..da0b80a1917a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -652,20 +652,27 @@ static int stmmac_init_ptp(struct stmmac_priv *priv)
 	if (IS_ERR(priv->clk_ptp_ref)) {
 		priv->clk_ptp_rate = clk_get_rate(priv->stmmac_clk);
 		priv->clk_ptp_ref = NULL;
+		netdev_dbg(priv->dev, "PTP uses main clock\n");
 	} else {
 		clk_prepare_enable(priv->clk_ptp_ref);
 		priv->clk_ptp_rate = clk_get_rate(priv->clk_ptp_ref);
+		netdev_dbg(priv->dev, "PTP rate %d\n", priv->clk_ptp_rate);
 	}
 
 	priv->adv_ts = 0;
-	if (priv->dma_cap.atime_stamp && priv->extend_desc)
+	/* Check if adv_ts can be enabled for dwmac 4.x core */
+	if (priv->plat->has_gmac4 && priv->dma_cap.atime_stamp)
+		priv->adv_ts = 1;
+	/* Dwmac 3.x core with extend_desc can support adv_ts */
+	else if (priv->extend_desc && priv->dma_cap.atime_stamp)
 		priv->adv_ts = 1;
 
-	if (netif_msg_hw(priv) && priv->dma_cap.time_stamp)
-		pr_debug("IEEE 1588-2002 Time Stamp supported\n");
+	if (priv->dma_cap.time_stamp)
+		netdev_info(priv->dev, "IEEE 1588-2002 Timestamp supported\n");
 
-	if (netif_msg_hw(priv) && priv->adv_ts)
-		pr_debug("IEEE 1588-2008 Advanced Time Stamp supported\n");
+	if (priv->adv_ts)
+		netdev_info(priv->dev,
+			    "IEEE 1588-2008 Advanced Timestamp supported\n");
 
 	priv->hw->ptp = &stmmac_ptp;
 	priv->hwts_tx_en = 0;
@@ -1702,8 +1709,8 @@ static int stmmac_hw_setup(struct net_device *dev, bool init_ptp)
 
 	if (init_ptp) {
 		ret = stmmac_init_ptp(priv);
-		if (ret && ret != -EOPNOTSUPP)
-			pr_warn("%s: failed PTP initialisation\n", __func__);
+		if (ret)
+			netdev_warn(priv->dev, "PTP support cannot init.\n");
 	}
 
 #ifdef CONFIG_DEBUG_FS
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
index 6e3b82972ce8..289d52725a6c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ptp.c
@@ -186,10 +186,12 @@ int stmmac_ptp_register(struct stmmac_priv *priv)
 					     priv->device);
 	if (IS_ERR(priv->ptp_clock)) {
 		priv->ptp_clock = NULL;
-		pr_err("ptp_clock_register() failed on %s\n", priv->dev->name);
-	} else if (priv->ptp_clock)
-		pr_debug("Added PTP HW clock successfully on %s\n",
-			 priv->dev->name);
+		return PTR_ERR(priv->ptp_clock);
+	}
+
+	spin_lock_init(&priv->ptp_lock);
+
+	netdev_dbg(priv->dev, "Added PTP HW clock successfully\n");
 
 	return 0;
 }
diff --git a/drivers/net/ethernet/ti/tlan.c b/drivers/net/ethernet/ti/tlan.c
index 4a3eeb10d45b..c8d53d8c83ee 100644
--- a/drivers/net/ethernet/ti/tlan.c
+++ b/drivers/net/ethernet/ti/tlan.c
@@ -610,8 +610,8 @@ err_out_regions:
 #ifdef CONFIG_PCI
 	if (pdev)
 		pci_release_regions(pdev);
-#endif
 err_out:
+#endif
 	if (pdev)
 		pci_disable_device(pdev);
 	return rc;
diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
index 35f9f9742a48..c688d68c39aa 100644
--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
@@ -431,8 +431,7 @@ static void axienet_setoptions(struct net_device *ndev, u32 options)
 	lp->options |= options;
 }
 
-static void __axienet_device_reset(struct axienet_local *lp,
-				   struct device *dev, off_t offset)
+static void __axienet_device_reset(struct axienet_local *lp, off_t offset)
 {
 	u32 timeout;
 	/* Reset Axi DMA. This would reset Axi Ethernet core as well. The reset
@@ -468,8 +467,8 @@ static void axienet_device_reset(struct net_device *ndev)
 	u32 axienet_status;
 	struct axienet_local *lp = netdev_priv(ndev);
 
-	__axienet_device_reset(lp, &ndev->dev, XAXIDMA_TX_CR_OFFSET);
-	__axienet_device_reset(lp, &ndev->dev, XAXIDMA_RX_CR_OFFSET);
+	__axienet_device_reset(lp, XAXIDMA_TX_CR_OFFSET);
+	__axienet_device_reset(lp, XAXIDMA_RX_CR_OFFSET);
 
 	lp->max_frm_size = XAE_MAX_VLAN_FRAME_SIZE;
 	lp->options |= XAE_OPTION_VLAN;
@@ -1338,8 +1337,8 @@ static void axienet_dma_err_handler(unsigned long data)
 	axienet_iow(lp, XAE_MDIO_MC_OFFSET, (mdio_mcreg &
 		    ~XAE_MDIO_MC_MDIOEN_MASK));
 
-	__axienet_device_reset(lp, &ndev->dev, XAXIDMA_TX_CR_OFFSET);
-	__axienet_device_reset(lp, &ndev->dev, XAXIDMA_RX_CR_OFFSET);
+	__axienet_device_reset(lp, XAXIDMA_TX_CR_OFFSET);
+	__axienet_device_reset(lp, XAXIDMA_RX_CR_OFFSET);
 
 	axienet_iow(lp, XAE_MDIO_MC_OFFSET, mdio_mcreg);
 	axienet_mdio_wait_until_ready(lp);
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 52eeb2f67276..f0919bd3a563 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -442,8 +442,6 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
 	}
 
 	net_trans_info = get_net_transport_info(skb, &hdr_offset);
-	if (net_trans_info == TRANSPORT_INFO_NOT_IP)
-		goto do_send;
 
 	/*
 	 * Setup the sendside checksum offload only if this is not a
@@ -478,56 +476,29 @@ static int netvsc_start_xmit(struct sk_buff *skb, struct net_device *net)
 		}
 		lso_info->lso_v2_transmit.tcp_header_offset = hdr_offset;
 		lso_info->lso_v2_transmit.mss = skb_shinfo(skb)->gso_size;
-		goto do_send;
-	}
-
-	if ((skb->ip_summed == CHECKSUM_NONE) ||
-	    (skb->ip_summed == CHECKSUM_UNNECESSARY))
-		goto do_send;
-
-	rndis_msg_size += NDIS_CSUM_PPI_SIZE;
-	ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
-			    TCPIP_CHKSUM_PKTINFO);
-
-	csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi +
-			ppi->ppi_offset);
-
-	if (net_trans_info & (INFO_IPV4 << 16))
-		csum_info->transmit.is_ipv4 = 1;
-	else
-		csum_info->transmit.is_ipv6 = 1;
-
-	if (net_trans_info & INFO_TCP) {
-		csum_info->transmit.tcp_checksum = 1;
-		csum_info->transmit.tcp_header_offset = hdr_offset;
-	} else if (net_trans_info & INFO_UDP) {
-		/* UDP checksum offload is not supported on ws2008r2.
-		 * Furthermore, on ws2012 and ws2012r2, there are some
-		 * issues with udp checksum offload from Linux guests.
-		 * (these are host issues).
-		 * For now compute the checksum here.
-		 */
-		struct udphdr *uh;
-		u16 udp_len;
-
-		ret = skb_cow_head(skb, 0);
-		if (ret)
-			goto no_memory;
-
-		uh = udp_hdr(skb);
-		udp_len = ntohs(uh->len);
-		uh->check = 0;
-		uh->check = csum_tcpudp_magic(ip_hdr(skb)->saddr,
-					      ip_hdr(skb)->daddr,
-					      udp_len, IPPROTO_UDP,
-					      csum_partial(uh, udp_len, 0));
-		if (uh->check == 0)
-			uh->check = CSUM_MANGLED_0;
-
-		csum_info->transmit.udp_checksum = 0;
+	} else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+		if (net_trans_info & INFO_TCP) {
+			rndis_msg_size += NDIS_CSUM_PPI_SIZE;
+			ppi = init_ppi_data(rndis_msg, NDIS_CSUM_PPI_SIZE,
+					    TCPIP_CHKSUM_PKTINFO);
+
+			csum_info = (struct ndis_tcp_ip_checksum_info *)((void *)ppi +
+									 ppi->ppi_offset);
+
+			if (net_trans_info & (INFO_IPV4 << 16))
+				csum_info->transmit.is_ipv4 = 1;
+			else
+				csum_info->transmit.is_ipv6 = 1;
+
+			csum_info->transmit.tcp_checksum = 1;
+			csum_info->transmit.tcp_header_offset = hdr_offset;
+		} else {
+			/* UDP checksum (and other) offload is not supported. */
+			if (skb_checksum_help(skb))
+				goto drop;
+		}
 	}
 
-do_send:
 	/* Start filling in the page buffers with the rndis hdr */
 	rndis_msg->msg_len += rndis_msg_size;
 	packet->total_data_buflen = rndis_msg->msg_len;
diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c
index c6f66832a1a6..f424b867f73e 100644
--- a/drivers/net/phy/phy.c
+++ b/drivers/net/phy/phy.c
@@ -608,6 +608,21 @@ void phy_start_machine(struct phy_device *phydev)
 }
 
 /**
+ * phy_trigger_machine - trigger the state machine to run
+ *
+ * @phydev: the phy_device struct
+ *
+ * Description: There has been a change in state which requires that the
+ *   state machine runs.
+ */
+
+static void phy_trigger_machine(struct phy_device *phydev)
+{
+	cancel_delayed_work_sync(&phydev->state_queue);
+	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
+}
+
+/**
  * phy_stop_machine - stop the PHY state machine tracking
  * @phydev: target phy_device struct
  *
@@ -639,6 +654,8 @@ static void phy_error(struct phy_device *phydev)
 	mutex_lock(&phydev->lock);
 	phydev->state = PHY_HALTED;
 	mutex_unlock(&phydev->lock);
+
+	phy_trigger_machine(phydev);
 }
 
 /**
@@ -800,8 +817,7 @@ void phy_change(struct work_struct *work)
 	}
 
 	/* reschedule state queue work to run as soon as possible */
-	cancel_delayed_work_sync(&phydev->state_queue);
-	queue_delayed_work(system_power_efficient_wq, &phydev->state_queue, 0);
+	phy_trigger_machine(phydev);
 	return;
 
 ignore:
@@ -890,6 +906,8 @@ void phy_start(struct phy_device *phydev)
 	/* if phy was suspended, bring the physical link up again */
 	if (do_resume)
 		phy_resume(phydev);
+
+	phy_trigger_machine(phydev);
 }
 EXPORT_SYMBOL(phy_start);
 
diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c
index 9d1fce8a6e84..3ff76c6db4f6 100644
--- a/drivers/net/usb/qmi_wwan.c
+++ b/drivers/net/usb/qmi_wwan.c
@@ -59,6 +59,10 @@ enum qmi_wwan_flags {
 	QMI_WWAN_FLAG_RAWIP = 1 << 0,
 };
 
+enum qmi_wwan_quirks {
+	QMI_WWAN_QUIRK_DTR = 1 << 0,	/* needs "set DTR" request */
+};
+
 static void qmi_wwan_netdev_setup(struct net_device *net)
 {
 	struct usbnet *dev = netdev_priv(net);
@@ -411,9 +415,14 @@ static int qmi_wwan_bind(struct usbnet *dev, struct usb_interface *intf)
 	 * clearing out state the clients might need.
 	 *
 	 * MDM9x30 is the first QMI chipset with USB3 support. Abuse
-	 * this fact to enable the quirk.
+	 * this fact to enable the quirk for all USB3 devices.
+	 *
+	 * There are also chipsets with the same "set DTR" requirement
+	 * but without USB3 support.  Devices based on these chips
+	 * need a quirk flag in the device ID table.
 	 */
-	if (le16_to_cpu(dev->udev->descriptor.bcdUSB) >= 0x0201) {
+	if (dev->driver_info->data & QMI_WWAN_QUIRK_DTR ||
+	    le16_to_cpu(dev->udev->descriptor.bcdUSB) >= 0x0201) {
 		qmi_wwan_manage_power(dev, 1);
 		qmi_wwan_change_dtr(dev, true);
 	}
@@ -526,6 +535,16 @@ static const struct driver_info	qmi_wwan_info = {
 	.rx_fixup       = qmi_wwan_rx_fixup,
 };
 
+static const struct driver_info	qmi_wwan_info_quirk_dtr = {
+	.description	= "WWAN/QMI device",
+	.flags		= FLAG_WWAN,
+	.bind		= qmi_wwan_bind,
+	.unbind		= qmi_wwan_unbind,
+	.manage_power	= qmi_wwan_manage_power,
+	.rx_fixup       = qmi_wwan_rx_fixup,
+	.data           = QMI_WWAN_QUIRK_DTR,
+};
+
 #define HUAWEI_VENDOR_ID	0x12D1
 
 /* map QMI/wwan function by a fixed interface number */
@@ -533,6 +552,11 @@ static const struct driver_info	qmi_wwan_info = {
 	USB_DEVICE_INTERFACE_NUMBER(vend, prod, num), \
 	.driver_info = (unsigned long)&qmi_wwan_info
 
+/* devices requiring "set DTR" quirk */
+#define QMI_QUIRK_SET_DTR(vend, prod, num) \
+	USB_DEVICE_INTERFACE_NUMBER(vend, prod, num), \
+	.driver_info = (unsigned long)&qmi_wwan_info_quirk_dtr
+
 /* Gobi 1000 QMI/wwan interface number is 3 according to qcserial */
 #define QMI_GOBI1K_DEVICE(vend, prod) \
 	QMI_FIXED_INTF(vend, prod, 3)
@@ -895,6 +919,8 @@ static const struct usb_device_id products[] = {
 	{QMI_FIXED_INTF(0x03f0, 0x4e1d, 8)},	/* HP lt4111 LTE/EV-DO/HSPA+ Gobi 4G Module */
 	{QMI_FIXED_INTF(0x22de, 0x9061, 3)},	/* WeTelecom WPD-600N */
 	{QMI_FIXED_INTF(0x1e0e, 0x9001, 5)},	/* SIMCom 7230E */
+	{QMI_QUIRK_SET_DTR(0x2c7c, 0x0125, 4)},	/* Quectel EC25, EC20 R2.0  Mini PCIe */
+	{QMI_QUIRK_SET_DTR(0x2c7c, 0x0121, 4)},	/* Quectel EC21 Mini PCIe */
 
 	/* 4. Gobi 1000 devices */
 	{QMI_GOBI1K_DEVICE(0x05c6, 0x9212)},	/* Acer Gobi Modem Device */
diff --git a/drivers/net/xen-netback/common.h b/drivers/net/xen-netback/common.h
index cf68149cbb55..3ce1f7da8647 100644
--- a/drivers/net/xen-netback/common.h
+++ b/drivers/net/xen-netback/common.h
@@ -407,4 +407,8 @@ u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len,
 
 void xenvif_set_skb_hash(struct xenvif *vif, struct sk_buff *skb);
 
+#ifdef CONFIG_DEBUG_FS
+void xenvif_dump_hash_info(struct xenvif *vif, struct seq_file *m);
+#endif
+
 #endif /* __XEN_NETBACK__COMMON_H__ */
diff --git a/drivers/net/xen-netback/hash.c b/drivers/net/xen-netback/hash.c
index 613bac057650..e8c5dddc54ba 100644
--- a/drivers/net/xen-netback/hash.c
+++ b/drivers/net/xen-netback/hash.c
@@ -360,6 +360,74 @@ u32 xenvif_set_hash_mapping(struct xenvif *vif, u32 gref, u32 len,
 	return XEN_NETIF_CTRL_STATUS_SUCCESS;
 }
 
+#ifdef CONFIG_DEBUG_FS
+void xenvif_dump_hash_info(struct xenvif *vif, struct seq_file *m)
+{
+	unsigned int i;
+
+	switch (vif->hash.alg) {
+	case XEN_NETIF_CTRL_HASH_ALGORITHM_TOEPLITZ:
+		seq_puts(m, "Hash Algorithm: TOEPLITZ\n");
+		break;
+
+	case XEN_NETIF_CTRL_HASH_ALGORITHM_NONE:
+		seq_puts(m, "Hash Algorithm: NONE\n");
+		/* FALLTHRU */
+	default:
+		return;
+	}
+
+	if (vif->hash.flags) {
+		seq_puts(m, "\nHash Flags:\n");
+
+		if (vif->hash.flags & XEN_NETIF_CTRL_HASH_TYPE_IPV4)
+			seq_puts(m, "- IPv4\n");
+		if (vif->hash.flags & XEN_NETIF_CTRL_HASH_TYPE_IPV4_TCP)
+			seq_puts(m, "- IPv4 + TCP\n");
+		if (vif->hash.flags & XEN_NETIF_CTRL_HASH_TYPE_IPV6)
+			seq_puts(m, "- IPv6\n");
+		if (vif->hash.flags & XEN_NETIF_CTRL_HASH_TYPE_IPV6_TCP)
+			seq_puts(m, "- IPv6 + TCP\n");
+	}
+
+	seq_puts(m, "\nHash Key:\n");
+
+	for (i = 0; i < XEN_NETBK_MAX_HASH_KEY_SIZE; ) {
+		unsigned int j, n;
+
+		n = 8;
+		if (i + n >= XEN_NETBK_MAX_HASH_KEY_SIZE)
+			n = XEN_NETBK_MAX_HASH_KEY_SIZE - i;
+
+		seq_printf(m, "[%2u - %2u]: ", i, i + n - 1);
+
+		for (j = 0; j < n; j++, i++)
+			seq_printf(m, "%02x ", vif->hash.key[i]);
+
+		seq_puts(m, "\n");
+	}
+
+	if (vif->hash.size != 0) {
+		seq_puts(m, "\nHash Mapping:\n");
+
+		for (i = 0; i < vif->hash.size; ) {
+			unsigned int j, n;
+
+			n = 8;
+			if (i + n >= vif->hash.size)
+				n = vif->hash.size - i;
+
+			seq_printf(m, "[%4u - %4u]: ", i, i + n - 1);
+
+			for (j = 0; j < n; j++, i++)
+				seq_printf(m, "%4u ", vif->hash.mapping[i]);
+
+			seq_puts(m, "\n");
+		}
+	}
+}
+#endif /* CONFIG_DEBUG_FS */
+
 void xenvif_init_hash(struct xenvif *vif)
 {
 	if (xenvif_hash_cache_size == 0)
diff --git a/drivers/net/xen-netback/rx.c b/drivers/net/xen-netback/rx.c
index 8e9ade6ccf18..b1cf7c6f407a 100644
--- a/drivers/net/xen-netback/rx.c
+++ b/drivers/net/xen-netback/rx.c
@@ -337,9 +337,9 @@ static void xenvif_rx_next_chunk(struct xenvif_queue *queue,
 	frag_data += pkt->frag_offset;
 	frag_len -= pkt->frag_offset;
 
-	chunk_len = min(frag_len, XEN_PAGE_SIZE - offset);
-	chunk_len = min(chunk_len,
-			XEN_PAGE_SIZE -	xen_offset_in_page(frag_data));
+	chunk_len = min_t(size_t, frag_len, XEN_PAGE_SIZE - offset);
+	chunk_len = min_t(size_t, chunk_len, XEN_PAGE_SIZE -
+					     xen_offset_in_page(frag_data));
 
 	pkt->frag_offset += chunk_len;
 
@@ -425,6 +425,8 @@ void xenvif_rx_skb(struct xenvif_queue *queue)
 
 	xenvif_rx_next_skb(queue, &pkt);
 
+	queue->last_rx_time = jiffies;
+
 	do {
 		struct xen_netif_rx_request *req;
 		struct xen_netif_rx_response *rsp;
diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 7056404e3cb8..8674e188b697 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -165,7 +165,7 @@ xenvif_write_io_ring(struct file *filp, const char __user *buf, size_t count,
 	return count;
 }
 
-static int xenvif_dump_open(struct inode *inode, struct file *filp)
+static int xenvif_io_ring_open(struct inode *inode, struct file *filp)
 {
 	int ret;
 	void *queue = NULL;
@@ -179,13 +179,35 @@ static int xenvif_dump_open(struct inode *inode, struct file *filp)
 
 static const struct file_operations xenvif_dbg_io_ring_ops_fops = {
 	.owner = THIS_MODULE,
-	.open = xenvif_dump_open,
+	.open = xenvif_io_ring_open,
 	.read = seq_read,
 	.llseek = seq_lseek,
 	.release = single_release,
 	.write = xenvif_write_io_ring,
 };
 
+static int xenvif_read_ctrl(struct seq_file *m, void *v)
+{
+	struct xenvif *vif = m->private;
+
+	xenvif_dump_hash_info(vif, m);
+
+	return 0;
+}
+
+static int xenvif_ctrl_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, xenvif_read_ctrl, inode->i_private);
+}
+
+static const struct file_operations xenvif_dbg_ctrl_ops_fops = {
+	.owner = THIS_MODULE,
+	.open = xenvif_ctrl_open,
+	.read = seq_read,
+	.llseek = seq_lseek,
+	.release = single_release,
+};
+
 static void xenvif_debugfs_addif(struct xenvif *vif)
 {
 	struct dentry *pfile;
@@ -210,6 +232,17 @@ static void xenvif_debugfs_addif(struct xenvif *vif)
 				pr_warn("Creation of io_ring file returned %ld!\n",
 					PTR_ERR(pfile));
 		}
+
+		if (vif->ctrl_irq) {
+			pfile = debugfs_create_file("ctrl",
+						    S_IRUSR,
+						    vif->xenvif_dbg_root,
+						    vif,
+						    &xenvif_dbg_ctrl_ops_fops);
+			if (IS_ERR_OR_NULL(pfile))
+				pr_warn("Creation of ctrl file returned %ld!\n",
+					PTR_ERR(pfile));
+		}
 	} else
 		netdev_warn(vif->dev,
 			    "Creation of vif debugfs dir returned %ld!\n",
diff --git a/drivers/pci/host/pci-aardvark.c b/drivers/pci/host/pci-aardvark.c
index e4a5b7ee90cf..4fce494271cc 100644
--- a/drivers/pci/host/pci-aardvark.c
+++ b/drivers/pci/host/pci-aardvark.c
@@ -230,20 +230,20 @@ static int advk_pcie_link_up(struct advk_pcie *pcie)
 
 static int advk_pcie_wait_for_link(struct advk_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	int retries;
 
 	/* check if the link is up or not */
 	for (retries = 0; retries < LINK_WAIT_MAX_RETRIES; retries++) {
 		if (advk_pcie_link_up(pcie)) {
-			dev_info(&pcie->pdev->dev, "link up\n");
+			dev_info(dev, "link up\n");
 			return 0;
 		}
 
 		usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
 	}
 
-	dev_err(&pcie->pdev->dev, "link never came up\n");
-
+	dev_err(dev, "link never came up\n");
 	return -ETIMEDOUT;
 }
 
@@ -376,6 +376,7 @@ static void advk_pcie_setup_hw(struct advk_pcie *pcie)
 
 static void advk_pcie_check_pio_status(struct advk_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	u32 reg;
 	unsigned int status;
 	char *strcomp_status, *str_posted;
@@ -407,12 +408,13 @@ static void advk_pcie_check_pio_status(struct advk_pcie *pcie)
 	else
 		str_posted = "Posted";
 
-	dev_err(&pcie->pdev->dev, "%s PIO Response Status: %s, %#x @ %#x\n",
+	dev_err(dev, "%s PIO Response Status: %s, %#x @ %#x\n",
 		str_posted, strcomp_status, reg, advk_readl(pcie, PIO_ADDR_LS));
 }
 
 static int advk_pcie_wait_pio(struct advk_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	unsigned long timeout;
 
 	timeout = jiffies + msecs_to_jiffies(PIO_TIMEOUT_MS);
@@ -426,7 +428,7 @@ static int advk_pcie_wait_pio(struct advk_pcie *pcie)
 			return 0;
 	}
 
-	dev_err(&pcie->pdev->dev, "config read/write timed out\n");
+	dev_err(dev, "config read/write timed out\n");
 	return -ETIMEDOUT;
 }
 
@@ -560,10 +562,11 @@ static int advk_pcie_alloc_msi(struct advk_pcie *pcie)
 
 static void advk_pcie_free_msi(struct advk_pcie *pcie, int hwirq)
 {
+	struct device *dev = &pcie->pdev->dev;
+
 	mutex_lock(&pcie->msi_used_lock);
 	if (!test_bit(hwirq, pcie->msi_irq_in_use))
-		dev_err(&pcie->pdev->dev, "trying to free unused MSI#%d\n",
-			hwirq);
+		dev_err(dev, "trying to free unused MSI#%d\n", hwirq);
 	else
 		clear_bit(hwirq, pcie->msi_irq_in_use);
 	mutex_unlock(&pcie->msi_used_lock);
@@ -910,6 +913,7 @@ out_release_res:
 
 static int advk_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct advk_pcie *pcie;
 	struct resource *res;
 	struct pci_bus *bus, *child;
@@ -917,31 +921,29 @@ static int advk_pcie_probe(struct platform_device *pdev)
 	struct device_node *msi_node;
 	int ret, irq;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(struct advk_pcie),
-			    GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(struct advk_pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
 	pcie->pdev = pdev;
-	platform_set_drvdata(pdev, pcie);
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	pcie->base = devm_ioremap_resource(&pdev->dev, res);
+	pcie->base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->base))
 		return PTR_ERR(pcie->base);
 
 	irq = platform_get_irq(pdev, 0);
-	ret = devm_request_irq(&pdev->dev, irq, advk_pcie_irq_handler,
+	ret = devm_request_irq(dev, irq, advk_pcie_irq_handler,
 			       IRQF_SHARED | IRQF_NO_THREAD, "advk-pcie",
 			       pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to register interrupt\n");
+		dev_err(dev, "Failed to register interrupt\n");
 		return ret;
 	}
 
 	ret = advk_pcie_parse_request_of_pci_ranges(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to parse resources\n");
+		dev_err(dev, "Failed to parse resources\n");
 		return ret;
 	}
 
@@ -949,24 +951,24 @@ static int advk_pcie_probe(struct platform_device *pdev)
 
 	ret = advk_pcie_init_irq_domain(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to initialize irq\n");
+		dev_err(dev, "Failed to initialize irq\n");
 		return ret;
 	}
 
 	ret = advk_pcie_init_msi_irq_domain(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed to initialize irq\n");
+		dev_err(dev, "Failed to initialize irq\n");
 		advk_pcie_remove_irq_domain(pcie);
 		return ret;
 	}
 
-	msi_node = of_parse_phandle(pdev->dev.of_node, "msi-parent", 0);
+	msi_node = of_parse_phandle(dev->of_node, "msi-parent", 0);
 	if (msi_node)
 		msi = of_pci_find_msi_chip_by_node(msi_node);
 	else
 		msi = NULL;
 
-	bus = pci_scan_root_bus_msi(&pdev->dev, 0, &advk_pcie_ops,
+	bus = pci_scan_root_bus_msi(dev, 0, &advk_pcie_ops,
 				    pcie, &pcie->resources, &pcie->msi);
 	if (!bus) {
 		advk_pcie_remove_msi_irq_domain(pcie);
@@ -980,7 +982,6 @@ static int advk_pcie_probe(struct platform_device *pdev)
 		pcie_bus_configure_settings(child);
 
 	pci_bus_add_devices(bus);
-
 	return 0;
 }
 
diff --git a/drivers/pci/host/pci-dra7xx.c b/drivers/pci/host/pci-dra7xx.c
index 19223ed2e619..9595fad63f6f 100644
--- a/drivers/pci/host/pci-dra7xx.c
+++ b/drivers/pci/host/pci-dra7xx.c
@@ -64,11 +64,10 @@
 #define	DRA7XX_CPU_TO_BUS_ADDR				0x0FFFFFFF
 
 struct dra7xx_pcie {
-	void __iomem		*base;
-	struct phy		**phy;
-	int			phy_count;
-	struct device		*dev;
 	struct pcie_port	pp;
+	void __iomem		*base;		/* DT ti_conf */
+	int			phy_count;	/* DT phy-names count */
+	struct phy		**phy;
 };
 
 #define to_dra7xx_pcie(x)	container_of((x), struct dra7xx_pcie, pp)
@@ -84,17 +83,6 @@ static inline void dra7xx_pcie_writel(struct dra7xx_pcie *pcie, u32 offset,
 	writel(value, pcie->base + offset);
 }
 
-static inline u32 dra7xx_pcie_readl_rc(struct pcie_port *pp, u32 offset)
-{
-	return readl(pp->dbi_base + offset);
-}
-
-static inline void dra7xx_pcie_writel_rc(struct pcie_port *pp, u32 offset,
-					 u32 value)
-{
-	writel(value, pp->dbi_base + offset);
-}
-
 static int dra7xx_pcie_link_up(struct pcie_port *pp)
 {
 	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
@@ -103,13 +91,14 @@ static int dra7xx_pcie_link_up(struct pcie_port *pp)
 	return !!(reg & LINK_UP);
 }
 
-static int dra7xx_pcie_establish_link(struct pcie_port *pp)
+static int dra7xx_pcie_establish_link(struct dra7xx_pcie *dra7xx)
 {
-	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
+	struct pcie_port *pp = &dra7xx->pp;
+	struct device *dev = pp->dev;
 	u32 reg;
 
 	if (dw_pcie_link_up(pp)) {
-		dev_err(pp->dev, "link is already up\n");
+		dev_err(dev, "link is already up\n");
 		return 0;
 	}
 
@@ -120,10 +109,8 @@ static int dra7xx_pcie_establish_link(struct pcie_port *pp)
 	return dw_pcie_wait_for_link(pp);
 }
 
-static void dra7xx_pcie_enable_interrupts(struct pcie_port *pp)
+static void dra7xx_pcie_enable_interrupts(struct dra7xx_pcie *dra7xx)
 {
-	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
-
 	dra7xx_pcie_writel(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MAIN,
 			   ~INTERRUPTS);
 	dra7xx_pcie_writel(dra7xx,
@@ -142,6 +129,8 @@ static void dra7xx_pcie_enable_interrupts(struct pcie_port *pp)
 
 static void dra7xx_pcie_host_init(struct pcie_port *pp)
 {
+	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
+
 	pp->io_base &= DRA7XX_CPU_TO_BUS_ADDR;
 	pp->mem_base &= DRA7XX_CPU_TO_BUS_ADDR;
 	pp->cfg0_base &= DRA7XX_CPU_TO_BUS_ADDR;
@@ -149,10 +138,10 @@ static void dra7xx_pcie_host_init(struct pcie_port *pp)
 
 	dw_pcie_setup_rc(pp);
 
-	dra7xx_pcie_establish_link(pp);
+	dra7xx_pcie_establish_link(dra7xx);
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		dw_pcie_msi_init(pp);
-	dra7xx_pcie_enable_interrupts(pp);
+	dra7xx_pcie_enable_interrupts(dra7xx);
 }
 
 static struct pcie_host_ops dra7xx_pcie_host_ops = {
@@ -196,8 +185,8 @@ static int dra7xx_pcie_init_irq_domain(struct pcie_port *pp)
 
 static irqreturn_t dra7xx_pcie_msi_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
-	struct dra7xx_pcie *dra7xx = to_dra7xx_pcie(pp);
+	struct dra7xx_pcie *dra7xx = arg;
+	struct pcie_port *pp = &dra7xx->pp;
 	u32 reg;
 
 	reg = dra7xx_pcie_readl(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MSI);
@@ -223,51 +212,51 @@ static irqreturn_t dra7xx_pcie_msi_irq_handler(int irq, void *arg)
 static irqreturn_t dra7xx_pcie_irq_handler(int irq, void *arg)
 {
 	struct dra7xx_pcie *dra7xx = arg;
+	struct device *dev = dra7xx->pp.dev;
 	u32 reg;
 
 	reg = dra7xx_pcie_readl(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MAIN);
 
 	if (reg & ERR_SYS)
-		dev_dbg(dra7xx->dev, "System Error\n");
+		dev_dbg(dev, "System Error\n");
 
 	if (reg & ERR_FATAL)
-		dev_dbg(dra7xx->dev, "Fatal Error\n");
+		dev_dbg(dev, "Fatal Error\n");
 
 	if (reg & ERR_NONFATAL)
-		dev_dbg(dra7xx->dev, "Non Fatal Error\n");
+		dev_dbg(dev, "Non Fatal Error\n");
 
 	if (reg & ERR_COR)
-		dev_dbg(dra7xx->dev, "Correctable Error\n");
+		dev_dbg(dev, "Correctable Error\n");
 
 	if (reg & ERR_AXI)
-		dev_dbg(dra7xx->dev, "AXI tag lookup fatal Error\n");
+		dev_dbg(dev, "AXI tag lookup fatal Error\n");
 
 	if (reg & ERR_ECRC)
-		dev_dbg(dra7xx->dev, "ECRC Error\n");
+		dev_dbg(dev, "ECRC Error\n");
 
 	if (reg & PME_TURN_OFF)
-		dev_dbg(dra7xx->dev,
+		dev_dbg(dev,
 			"Power Management Event Turn-Off message received\n");
 
 	if (reg & PME_TO_ACK)
-		dev_dbg(dra7xx->dev,
+		dev_dbg(dev,
 			"Power Management Turn-Off Ack message received\n");
 
 	if (reg & PM_PME)
-		dev_dbg(dra7xx->dev,
-			"PM Power Management Event message received\n");
+		dev_dbg(dev, "PM Power Management Event message received\n");
 
 	if (reg & LINK_REQ_RST)
-		dev_dbg(dra7xx->dev, "Link Request Reset\n");
+		dev_dbg(dev, "Link Request Reset\n");
 
 	if (reg & LINK_UP_EVT)
-		dev_dbg(dra7xx->dev, "Link-up state change\n");
+		dev_dbg(dev, "Link-up state change\n");
 
 	if (reg & CFG_BME_EVT)
-		dev_dbg(dra7xx->dev, "CFG 'Bus Master Enable' change\n");
+		dev_dbg(dev, "CFG 'Bus Master Enable' change\n");
 
 	if (reg & CFG_MSE_EVT)
-		dev_dbg(dra7xx->dev, "CFG 'Memory Space Enable' change\n");
+		dev_dbg(dev, "CFG 'Memory Space Enable' change\n");
 
 	dra7xx_pcie_writel(dra7xx, PCIECTRL_DRA7XX_CONF_IRQSTATUS_MAIN, reg);
 
@@ -278,13 +267,9 @@ static int __init dra7xx_add_pcie_port(struct dra7xx_pcie *dra7xx,
 				       struct platform_device *pdev)
 {
 	int ret;
-	struct pcie_port *pp;
+	struct pcie_port *pp = &dra7xx->pp;
+	struct device *dev = pp->dev;
 	struct resource *res;
-	struct device *dev = &pdev->dev;
-
-	pp = &dra7xx->pp;
-	pp->dev = dev;
-	pp->ops = &dra7xx_pcie_host_ops;
 
 	pp->irq = platform_get_irq(pdev, 1);
 	if (pp->irq < 0) {
@@ -292,12 +277,11 @@ static int __init dra7xx_add_pcie_port(struct dra7xx_pcie *dra7xx,
 		return -EINVAL;
 	}
 
-	ret = devm_request_irq(&pdev->dev, pp->irq,
-			       dra7xx_pcie_msi_irq_handler,
+	ret = devm_request_irq(dev, pp->irq, dra7xx_pcie_msi_irq_handler,
 			       IRQF_SHARED | IRQF_NO_THREAD,
-			       "dra7-pcie-msi",	pp);
+			       "dra7-pcie-msi",	dra7xx);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to request irq\n");
+		dev_err(dev, "failed to request irq\n");
 		return ret;
 	}
 
@@ -314,7 +298,7 @@ static int __init dra7xx_add_pcie_port(struct dra7xx_pcie *dra7xx,
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(dra7xx->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -332,6 +316,7 @@ static int __init dra7xx_pcie_probe(struct platform_device *pdev)
 	void __iomem *base;
 	struct resource *res;
 	struct dra7xx_pcie *dra7xx;
+	struct pcie_port *pp;
 	struct device *dev = &pdev->dev;
 	struct device_node *np = dev->of_node;
 	char name[10];
@@ -343,6 +328,10 @@ static int __init dra7xx_pcie_probe(struct platform_device *pdev)
 	if (!dra7xx)
 		return -ENOMEM;
 
+	pp = &dra7xx->pp;
+	pp->dev = dev;
+	pp->ops = &dra7xx_pcie_host_ops;
+
 	irq = platform_get_irq(pdev, 0);
 	if (irq < 0) {
 		dev_err(dev, "missing IRQ resource\n");
@@ -390,7 +379,6 @@ static int __init dra7xx_pcie_probe(struct platform_device *pdev)
 
 	dra7xx->base = base;
 	dra7xx->phy = phy;
-	dra7xx->dev = dev;
 	dra7xx->phy_count = phy_count;
 
 	pm_runtime_enable(dev);
@@ -407,7 +395,7 @@ static int __init dra7xx_pcie_probe(struct platform_device *pdev)
 		ret = devm_gpio_request_one(dev, gpio_sel, gpio_flags,
 					    "pcie_reset");
 		if (ret) {
-			dev_err(&pdev->dev, "gpio%d request failed, ret %d\n",
+			dev_err(dev, "gpio%d request failed, ret %d\n",
 				gpio_sel, ret);
 			goto err_gpio;
 		}
@@ -420,12 +408,11 @@ static int __init dra7xx_pcie_probe(struct platform_device *pdev)
 	reg &= ~LTSSM_EN;
 	dra7xx_pcie_writel(dra7xx, PCIECTRL_DRA7XX_CONF_DEVICE_CMD, reg);
 
-	platform_set_drvdata(pdev, dra7xx);
-
 	ret = dra7xx_add_pcie_port(dra7xx, pdev);
 	if (ret < 0)
 		goto err_gpio;
 
+	platform_set_drvdata(pdev, dra7xx);
 	return 0;
 
 err_gpio:
@@ -451,9 +438,9 @@ static int dra7xx_pcie_suspend(struct device *dev)
 	u32 val;
 
 	/* clear MSE */
-	val = dra7xx_pcie_readl_rc(pp, PCI_COMMAND);
+	val = dw_pcie_readl_rc(pp, PCI_COMMAND);
 	val &= ~PCI_COMMAND_MEMORY;
-	dra7xx_pcie_writel_rc(pp, PCI_COMMAND, val);
+	dw_pcie_writel_rc(pp, PCI_COMMAND, val);
 
 	return 0;
 }
@@ -465,9 +452,9 @@ static int dra7xx_pcie_resume(struct device *dev)
 	u32 val;
 
 	/* set MSE */
-	val = dra7xx_pcie_readl_rc(pp, PCI_COMMAND);
+	val = dw_pcie_readl_rc(pp, PCI_COMMAND);
 	val |= PCI_COMMAND_MEMORY;
-	dra7xx_pcie_writel_rc(pp, PCI_COMMAND, val);
+	dw_pcie_writel_rc(pp, PCI_COMMAND, val);
 
 	return 0;
 }
diff --git a/drivers/pci/host/pci-exynos.c b/drivers/pci/host/pci-exynos.c
index 2e2d7f00b9e8..f1c544bb8b68 100644
--- a/drivers/pci/host/pci-exynos.c
+++ b/drivers/pci/host/pci-exynos.c
@@ -29,13 +29,13 @@
 #define to_exynos_pcie(x)	container_of(x, struct exynos_pcie, pp)
 
 struct exynos_pcie {
-	void __iomem		*elbi_base;
-	void __iomem		*phy_base;
-	void __iomem		*block_base;
+	struct pcie_port	pp;
+	void __iomem		*elbi_base;	/* DT 0th resource */
+	void __iomem		*phy_base;	/* DT 1st resource */
+	void __iomem		*block_base;	/* DT 2nd resource */
 	int			reset_gpio;
 	struct clk		*clk;
 	struct clk		*bus_clk;
-	struct pcie_port	pp;
 };
 
 /* PCIe ELBI registers */
@@ -102,40 +102,40 @@ struct exynos_pcie {
 #define PCIE_PHY_TRSV3_PD_TSV		(0x1 << 7)
 #define PCIE_PHY_TRSV3_LVCC		0x31c
 
-static inline void exynos_elb_writel(struct exynos_pcie *pcie, u32 val, u32 reg)
+static void exynos_elb_writel(struct exynos_pcie *exynos_pcie, u32 val, u32 reg)
 {
-	writel(val, pcie->elbi_base + reg);
+	writel(val, exynos_pcie->elbi_base + reg);
 }
 
-static inline u32 exynos_elb_readl(struct exynos_pcie *pcie, u32 reg)
+static u32 exynos_elb_readl(struct exynos_pcie *exynos_pcie, u32 reg)
 {
-	return readl(pcie->elbi_base + reg);
+	return readl(exynos_pcie->elbi_base + reg);
 }
 
-static inline void exynos_phy_writel(struct exynos_pcie *pcie, u32 val, u32 reg)
+static void exynos_phy_writel(struct exynos_pcie *exynos_pcie, u32 val, u32 reg)
 {
-	writel(val, pcie->phy_base + reg);
+	writel(val, exynos_pcie->phy_base + reg);
 }
 
-static inline u32 exynos_phy_readl(struct exynos_pcie *pcie, u32 reg)
+static u32 exynos_phy_readl(struct exynos_pcie *exynos_pcie, u32 reg)
 {
-	return readl(pcie->phy_base + reg);
+	return readl(exynos_pcie->phy_base + reg);
 }
 
-static inline void exynos_blk_writel(struct exynos_pcie *pcie, u32 val, u32 reg)
+static void exynos_blk_writel(struct exynos_pcie *exynos_pcie, u32 val, u32 reg)
 {
-	writel(val, pcie->block_base + reg);
+	writel(val, exynos_pcie->block_base + reg);
 }
 
-static inline u32 exynos_blk_readl(struct exynos_pcie *pcie, u32 reg)
+static u32 exynos_blk_readl(struct exynos_pcie *exynos_pcie, u32 reg)
 {
-	return readl(pcie->block_base + reg);
+	return readl(exynos_pcie->block_base + reg);
 }
 
-static void exynos_pcie_sideband_dbi_w_mode(struct pcie_port *pp, bool on)
+static void exynos_pcie_sideband_dbi_w_mode(struct exynos_pcie *exynos_pcie,
+					    bool on)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	if (on) {
 		val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_SLV_AWMISC);
@@ -148,10 +148,10 @@ static void exynos_pcie_sideband_dbi_w_mode(struct pcie_port *pp, bool on)
 	}
 }
 
-static void exynos_pcie_sideband_dbi_r_mode(struct pcie_port *pp, bool on)
+static void exynos_pcie_sideband_dbi_r_mode(struct exynos_pcie *exynos_pcie,
+					    bool on)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	if (on) {
 		val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_SLV_ARMISC);
@@ -164,10 +164,9 @@ static void exynos_pcie_sideband_dbi_r_mode(struct pcie_port *pp, bool on)
 	}
 }
 
-static void exynos_pcie_assert_core_reset(struct pcie_port *pp)
+static void exynos_pcie_assert_core_reset(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_elb_readl(exynos_pcie, PCIE_CORE_RESET);
 	val &= ~PCIE_CORE_RESET_ENABLE;
@@ -177,10 +176,9 @@ static void exynos_pcie_assert_core_reset(struct pcie_port *pp)
 	exynos_elb_writel(exynos_pcie, 0, PCIE_NONSTICKY_RESET);
 }
 
-static void exynos_pcie_deassert_core_reset(struct pcie_port *pp)
+static void exynos_pcie_deassert_core_reset(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_elb_readl(exynos_pcie, PCIE_CORE_RESET);
 	val |= PCIE_CORE_RESET_ENABLE;
@@ -193,18 +191,14 @@ static void exynos_pcie_deassert_core_reset(struct pcie_port *pp)
 	exynos_blk_writel(exynos_pcie, 1, PCIE_PHY_MAC_RESET);
 }
 
-static void exynos_pcie_assert_phy_reset(struct pcie_port *pp)
+static void exynos_pcie_assert_phy_reset(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_MAC_RESET);
 	exynos_blk_writel(exynos_pcie, 1, PCIE_PHY_GLOBAL_RESET);
 }
 
-static void exynos_pcie_deassert_phy_reset(struct pcie_port *pp)
+static void exynos_pcie_deassert_phy_reset(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_GLOBAL_RESET);
 	exynos_elb_writel(exynos_pcie, 1, PCIE_PWR_RESET);
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_COMMON_RESET);
@@ -213,10 +207,9 @@ static void exynos_pcie_deassert_phy_reset(struct pcie_port *pp)
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_TRSV_RESET);
 }
 
-static void exynos_pcie_power_on_phy(struct pcie_port *pp)
+static void exynos_pcie_power_on_phy(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_phy_readl(exynos_pcie, PCIE_PHY_COMMON_POWER);
 	val &= ~PCIE_PHY_COMMON_PD_CMN;
@@ -239,10 +232,9 @@ static void exynos_pcie_power_on_phy(struct pcie_port *pp)
 	exynos_phy_writel(exynos_pcie, val, PCIE_PHY_TRSV3_POWER);
 }
 
-static void exynos_pcie_power_off_phy(struct pcie_port *pp)
+static void exynos_pcie_power_off_phy(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_phy_readl(exynos_pcie, PCIE_PHY_COMMON_POWER);
 	val |= PCIE_PHY_COMMON_PD_CMN;
@@ -265,10 +257,8 @@ static void exynos_pcie_power_off_phy(struct pcie_port *pp)
 	exynos_phy_writel(exynos_pcie, val, PCIE_PHY_TRSV3_POWER);
 }
 
-static void exynos_pcie_init_phy(struct pcie_port *pp)
+static void exynos_pcie_init_phy(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-
 	/* DCC feedback control off */
 	exynos_phy_writel(exynos_pcie, 0x29, PCIE_PHY_DCC_FEEDBACK);
 
@@ -305,51 +295,41 @@ static void exynos_pcie_init_phy(struct pcie_port *pp)
 	exynos_phy_writel(exynos_pcie, 0xa0, PCIE_PHY_TRSV3_LVCC);
 }
 
-static void exynos_pcie_assert_reset(struct pcie_port *pp)
+static void exynos_pcie_assert_reset(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+	struct pcie_port *pp = &exynos_pcie->pp;
+	struct device *dev = pp->dev;
 
 	if (exynos_pcie->reset_gpio >= 0)
-		devm_gpio_request_one(pp->dev, exynos_pcie->reset_gpio,
+		devm_gpio_request_one(dev, exynos_pcie->reset_gpio,
 				GPIOF_OUT_INIT_HIGH, "RESET");
 }
 
-static int exynos_pcie_establish_link(struct pcie_port *pp)
+static int exynos_pcie_establish_link(struct exynos_pcie *exynos_pcie)
 {
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+	struct pcie_port *pp = &exynos_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 val;
 
 	if (dw_pcie_link_up(pp)) {
-		dev_err(pp->dev, "Link already up\n");
+		dev_err(dev, "Link already up\n");
 		return 0;
 	}
 
-	/* assert reset signals */
-	exynos_pcie_assert_core_reset(pp);
-	exynos_pcie_assert_phy_reset(pp);
-
-	/* de-assert phy reset */
-	exynos_pcie_deassert_phy_reset(pp);
-
-	/* power on phy */
-	exynos_pcie_power_on_phy(pp);
-
-	/* initialize phy */
-	exynos_pcie_init_phy(pp);
+	exynos_pcie_assert_core_reset(exynos_pcie);
+	exynos_pcie_assert_phy_reset(exynos_pcie);
+	exynos_pcie_deassert_phy_reset(exynos_pcie);
+	exynos_pcie_power_on_phy(exynos_pcie);
+	exynos_pcie_init_phy(exynos_pcie);
 
 	/* pulse for common reset */
 	exynos_blk_writel(exynos_pcie, 1, PCIE_PHY_COMMON_RESET);
 	udelay(500);
 	exynos_blk_writel(exynos_pcie, 0, PCIE_PHY_COMMON_RESET);
 
-	/* de-assert core reset */
-	exynos_pcie_deassert_core_reset(pp);
-
-	/* setup root complex */
+	exynos_pcie_deassert_core_reset(exynos_pcie);
 	dw_pcie_setup_rc(pp);
-
-	/* assert reset signal */
-	exynos_pcie_assert_reset(pp);
+	exynos_pcie_assert_reset(exynos_pcie);
 
 	/* assert LTSSM enable */
 	exynos_elb_writel(exynos_pcie, PCIE_ELBI_LTSSM_ENABLE,
@@ -361,27 +341,23 @@ static int exynos_pcie_establish_link(struct pcie_port *pp)
 
 	while (exynos_phy_readl(exynos_pcie, PCIE_PHY_PLL_LOCKED) == 0) {
 		val = exynos_blk_readl(exynos_pcie, PCIE_PHY_PLL_LOCKED);
-		dev_info(pp->dev, "PLL Locked: 0x%x\n", val);
+		dev_info(dev, "PLL Locked: 0x%x\n", val);
 	}
-	/* power off phy */
-	exynos_pcie_power_off_phy(pp);
-
+	exynos_pcie_power_off_phy(exynos_pcie);
 	return -ETIMEDOUT;
 }
 
-static void exynos_pcie_clear_irq_pulse(struct pcie_port *pp)
+static void exynos_pcie_clear_irq_pulse(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	val = exynos_elb_readl(exynos_pcie, PCIE_IRQ_PULSE);
 	exynos_elb_writel(exynos_pcie, val, PCIE_IRQ_PULSE);
 }
 
-static void exynos_pcie_enable_irq_pulse(struct pcie_port *pp)
+static void exynos_pcie_enable_irq_pulse(struct exynos_pcie *exynos_pcie)
 {
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	/* enable INTX interrupt */
 	val = IRQ_INTA_ASSERT | IRQ_INTB_ASSERT |
@@ -391,23 +367,24 @@ static void exynos_pcie_enable_irq_pulse(struct pcie_port *pp)
 
 static irqreturn_t exynos_pcie_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct exynos_pcie *exynos_pcie = arg;
 
-	exynos_pcie_clear_irq_pulse(pp);
+	exynos_pcie_clear_irq_pulse(exynos_pcie);
 	return IRQ_HANDLED;
 }
 
 static irqreturn_t exynos_pcie_msi_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct exynos_pcie *exynos_pcie = arg;
+	struct pcie_port *pp = &exynos_pcie->pp;
 
 	return dw_handle_msi_irq(pp);
 }
 
-static void exynos_pcie_msi_init(struct pcie_port *pp)
+static void exynos_pcie_msi_init(struct exynos_pcie *exynos_pcie)
 {
+	struct pcie_port *pp = &exynos_pcie->pp;
 	u32 val;
-	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 
 	dw_pcie_msi_init(pp);
 
@@ -417,60 +394,64 @@ static void exynos_pcie_msi_init(struct pcie_port *pp)
 	exynos_elb_writel(exynos_pcie, val, PCIE_IRQ_EN_LEVEL);
 }
 
-static void exynos_pcie_enable_interrupts(struct pcie_port *pp)
+static void exynos_pcie_enable_interrupts(struct exynos_pcie *exynos_pcie)
 {
-	exynos_pcie_enable_irq_pulse(pp);
+	exynos_pcie_enable_irq_pulse(exynos_pcie);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
-		exynos_pcie_msi_init(pp);
+		exynos_pcie_msi_init(exynos_pcie);
 }
 
-static inline u32 exynos_pcie_readl_rc(struct pcie_port *pp,
-				       void __iomem *dbi_base)
+static u32 exynos_pcie_readl_rc(struct pcie_port *pp, u32 reg)
 {
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 	u32 val;
 
-	exynos_pcie_sideband_dbi_r_mode(pp, true);
-	val = readl(dbi_base);
-	exynos_pcie_sideband_dbi_r_mode(pp, false);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, true);
+	val = readl(pp->dbi_base + reg);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, false);
 	return val;
 }
 
-static inline void exynos_pcie_writel_rc(struct pcie_port *pp,
-					u32 val, void __iomem *dbi_base)
+static void exynos_pcie_writel_rc(struct pcie_port *pp, u32 reg, u32 val)
 {
-	exynos_pcie_sideband_dbi_w_mode(pp, true);
-	writel(val, dbi_base);
-	exynos_pcie_sideband_dbi_w_mode(pp, false);
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, true);
+	writel(val, pp->dbi_base + reg);
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, false);
 }
 
 static int exynos_pcie_rd_own_conf(struct pcie_port *pp, int where, int size,
 				u32 *val)
 {
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 	int ret;
 
-	exynos_pcie_sideband_dbi_r_mode(pp, true);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, true);
 	ret = dw_pcie_cfg_read(pp->dbi_base + where, size, val);
-	exynos_pcie_sideband_dbi_r_mode(pp, false);
+	exynos_pcie_sideband_dbi_r_mode(exynos_pcie, false);
 	return ret;
 }
 
 static int exynos_pcie_wr_own_conf(struct pcie_port *pp, int where, int size,
 				u32 val)
 {
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
 	int ret;
 
-	exynos_pcie_sideband_dbi_w_mode(pp, true);
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, true);
 	ret = dw_pcie_cfg_write(pp->dbi_base + where, size, val);
-	exynos_pcie_sideband_dbi_w_mode(pp, false);
+	exynos_pcie_sideband_dbi_w_mode(exynos_pcie, false);
 	return ret;
 }
 
 static int exynos_pcie_link_up(struct pcie_port *pp)
 {
 	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
-	u32 val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_RDLH_LINKUP);
+	u32 val;
 
+	val = exynos_elb_readl(exynos_pcie, PCIE_ELBI_RDLH_LINKUP);
 	if (val == PCIE_ELBI_LTSSM_ENABLE)
 		return 1;
 
@@ -479,8 +460,10 @@ static int exynos_pcie_link_up(struct pcie_port *pp)
 
 static void exynos_pcie_host_init(struct pcie_port *pp)
 {
-	exynos_pcie_establish_link(pp);
-	exynos_pcie_enable_interrupts(pp);
+	struct exynos_pcie *exynos_pcie = to_exynos_pcie(pp);
+
+	exynos_pcie_establish_link(exynos_pcie);
+	exynos_pcie_enable_interrupts(exynos_pcie);
 }
 
 static struct pcie_host_ops exynos_pcie_host_ops = {
@@ -492,36 +475,38 @@ static struct pcie_host_ops exynos_pcie_host_ops = {
 	.host_init = exynos_pcie_host_init,
 };
 
-static int __init exynos_add_pcie_port(struct pcie_port *pp,
+static int __init exynos_add_pcie_port(struct exynos_pcie *exynos_pcie,
 				       struct platform_device *pdev)
 {
+	struct pcie_port *pp = &exynos_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	pp->irq = platform_get_irq(pdev, 1);
 	if (!pp->irq) {
-		dev_err(&pdev->dev, "failed to get irq\n");
+		dev_err(dev, "failed to get irq\n");
 		return -ENODEV;
 	}
-	ret = devm_request_irq(&pdev->dev, pp->irq, exynos_pcie_irq_handler,
-				IRQF_SHARED, "exynos-pcie", pp);
+	ret = devm_request_irq(dev, pp->irq, exynos_pcie_irq_handler,
+				IRQF_SHARED, "exynos-pcie", exynos_pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to request irq\n");
+		dev_err(dev, "failed to request irq\n");
 		return ret;
 	}
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		pp->msi_irq = platform_get_irq(pdev, 0);
 		if (!pp->msi_irq) {
-			dev_err(&pdev->dev, "failed to get msi irq\n");
+			dev_err(dev, "failed to get msi irq\n");
 			return -ENODEV;
 		}
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 					exynos_pcie_msi_irq_handler,
 					IRQF_SHARED | IRQF_NO_THREAD,
-					"exynos-pcie", pp);
+					"exynos-pcie", exynos_pcie);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request msi irq\n");
+			dev_err(dev, "failed to request msi irq\n");
 			return ret;
 		}
 	}
@@ -531,7 +516,7 @@ static int __init exynos_add_pcie_port(struct pcie_port *pp,
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -540,37 +525,36 @@ static int __init exynos_add_pcie_port(struct pcie_port *pp,
 
 static int __init exynos_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct exynos_pcie *exynos_pcie;
 	struct pcie_port *pp;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct resource *elbi_base;
 	struct resource *phy_base;
 	struct resource *block_base;
 	int ret;
 
-	exynos_pcie = devm_kzalloc(&pdev->dev, sizeof(*exynos_pcie),
-				GFP_KERNEL);
+	exynos_pcie = devm_kzalloc(dev, sizeof(*exynos_pcie), GFP_KERNEL);
 	if (!exynos_pcie)
 		return -ENOMEM;
 
 	pp = &exynos_pcie->pp;
-
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	exynos_pcie->reset_gpio = of_get_named_gpio(np, "reset-gpio", 0);
 
-	exynos_pcie->clk = devm_clk_get(&pdev->dev, "pcie");
+	exynos_pcie->clk = devm_clk_get(dev, "pcie");
 	if (IS_ERR(exynos_pcie->clk)) {
-		dev_err(&pdev->dev, "Failed to get pcie rc clock\n");
+		dev_err(dev, "Failed to get pcie rc clock\n");
 		return PTR_ERR(exynos_pcie->clk);
 	}
 	ret = clk_prepare_enable(exynos_pcie->clk);
 	if (ret)
 		return ret;
 
-	exynos_pcie->bus_clk = devm_clk_get(&pdev->dev, "pcie_bus");
+	exynos_pcie->bus_clk = devm_clk_get(dev, "pcie_bus");
 	if (IS_ERR(exynos_pcie->bus_clk)) {
-		dev_err(&pdev->dev, "Failed to get pcie bus clock\n");
+		dev_err(dev, "Failed to get pcie bus clock\n");
 		ret = PTR_ERR(exynos_pcie->bus_clk);
 		goto fail_clk;
 	}
@@ -579,27 +563,27 @@ static int __init exynos_pcie_probe(struct platform_device *pdev)
 		goto fail_clk;
 
 	elbi_base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	exynos_pcie->elbi_base = devm_ioremap_resource(&pdev->dev, elbi_base);
+	exynos_pcie->elbi_base = devm_ioremap_resource(dev, elbi_base);
 	if (IS_ERR(exynos_pcie->elbi_base)) {
 		ret = PTR_ERR(exynos_pcie->elbi_base);
 		goto fail_bus_clk;
 	}
 
 	phy_base = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	exynos_pcie->phy_base = devm_ioremap_resource(&pdev->dev, phy_base);
+	exynos_pcie->phy_base = devm_ioremap_resource(dev, phy_base);
 	if (IS_ERR(exynos_pcie->phy_base)) {
 		ret = PTR_ERR(exynos_pcie->phy_base);
 		goto fail_bus_clk;
 	}
 
 	block_base = platform_get_resource(pdev, IORESOURCE_MEM, 2);
-	exynos_pcie->block_base = devm_ioremap_resource(&pdev->dev, block_base);
+	exynos_pcie->block_base = devm_ioremap_resource(dev, block_base);
 	if (IS_ERR(exynos_pcie->block_base)) {
 		ret = PTR_ERR(exynos_pcie->block_base);
 		goto fail_bus_clk;
 	}
 
-	ret = exynos_add_pcie_port(pp, pdev);
+	ret = exynos_add_pcie_port(exynos_pcie, pdev);
 	if (ret < 0)
 		goto fail_bus_clk;
 
diff --git a/drivers/pci/host/pci-imx6.c b/drivers/pci/host/pci-imx6.c
index ead4a5c3480b..c8cefb078218 100644
--- a/drivers/pci/host/pci-imx6.c
+++ b/drivers/pci/host/pci-imx6.c
@@ -39,16 +39,15 @@ enum imx6_pcie_variants {
 };
 
 struct imx6_pcie {
+	struct pcie_port	pp;	/* pp.dbi_base is DT 0th resource */
 	int			reset_gpio;
 	bool			gpio_active_high;
 	struct clk		*pcie_bus;
 	struct clk		*pcie_phy;
 	struct clk		*pcie_inbound_axi;
 	struct clk		*pcie;
-	struct pcie_port	pp;
 	struct regmap		*iomuxc_gpr;
 	enum imx6_pcie_variants variant;
-	void __iomem		*mem_base;
 	u32			tx_deemph_gen1;
 	u32			tx_deemph_gen2_3p5db;
 	u32			tx_deemph_gen2_6db;
@@ -96,14 +95,15 @@ struct imx6_pcie {
 #define PHY_RX_OVRD_IN_LO_RX_DATA_EN (1 << 5)
 #define PHY_RX_OVRD_IN_LO_RX_PLL_EN (1 << 3)
 
-static int pcie_phy_poll_ack(void __iomem *dbi_base, int exp_val)
+static int pcie_phy_poll_ack(struct imx6_pcie *imx6_pcie, int exp_val)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val;
 	u32 max_iterations = 10;
 	u32 wait_counter = 0;
 
 	do {
-		val = readl(dbi_base + PCIE_PHY_STAT);
+		val = dw_pcie_readl_rc(pp, PCIE_PHY_STAT);
 		val = (val >> PCIE_PHY_STAT_ACK_LOC) & 0x1;
 		wait_counter++;
 
@@ -116,123 +116,126 @@ static int pcie_phy_poll_ack(void __iomem *dbi_base, int exp_val)
 	return -ETIMEDOUT;
 }
 
-static int pcie_phy_wait_ack(void __iomem *dbi_base, int addr)
+static int pcie_phy_wait_ack(struct imx6_pcie *imx6_pcie, int addr)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val;
 	int ret;
 
 	val = addr << PCIE_PHY_CTRL_DATA_LOC;
-	writel(val, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, val);
 
 	val |= (0x1 << PCIE_PHY_CTRL_CAP_ADR_LOC);
-	writel(val, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, val);
 
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
 	val = addr << PCIE_PHY_CTRL_DATA_LOC;
-	writel(val, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, val);
 
-	return pcie_phy_poll_ack(dbi_base, 0);
+	return pcie_phy_poll_ack(imx6_pcie, 0);
 }
 
 /* Read from the 16-bit PCIe PHY control registers (not memory-mapped) */
-static int pcie_phy_read(void __iomem *dbi_base, int addr, int *data)
+static int pcie_phy_read(struct imx6_pcie *imx6_pcie, int addr, int *data)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val, phy_ctl;
 	int ret;
 
-	ret = pcie_phy_wait_ack(dbi_base, addr);
+	ret = pcie_phy_wait_ack(imx6_pcie, addr);
 	if (ret)
 		return ret;
 
 	/* assert Read signal */
 	phy_ctl = 0x1 << PCIE_PHY_CTRL_RD_LOC;
-	writel(phy_ctl, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, phy_ctl);
 
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
-	val = readl(dbi_base + PCIE_PHY_STAT);
+	val = dw_pcie_readl_rc(pp, PCIE_PHY_STAT);
 	*data = val & 0xffff;
 
 	/* deassert Read signal */
-	writel(0x00, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, 0x00);
 
-	return pcie_phy_poll_ack(dbi_base, 0);
+	return pcie_phy_poll_ack(imx6_pcie, 0);
 }
 
-static int pcie_phy_write(void __iomem *dbi_base, int addr, int data)
+static int pcie_phy_write(struct imx6_pcie *imx6_pcie, int addr, int data)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 var;
 	int ret;
 
 	/* write addr */
 	/* cap addr */
-	ret = pcie_phy_wait_ack(dbi_base, addr);
+	ret = pcie_phy_wait_ack(imx6_pcie, addr);
 	if (ret)
 		return ret;
 
 	var = data << PCIE_PHY_CTRL_DATA_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* capture data */
 	var |= (0x1 << PCIE_PHY_CTRL_CAP_DAT_LOC);
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
 	/* deassert cap data */
 	var = data << PCIE_PHY_CTRL_DATA_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* wait for ack de-assertion */
-	ret = pcie_phy_poll_ack(dbi_base, 0);
+	ret = pcie_phy_poll_ack(imx6_pcie, 0);
 	if (ret)
 		return ret;
 
 	/* assert wr signal */
 	var = 0x1 << PCIE_PHY_CTRL_WR_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* wait for ack */
-	ret = pcie_phy_poll_ack(dbi_base, 1);
+	ret = pcie_phy_poll_ack(imx6_pcie, 1);
 	if (ret)
 		return ret;
 
 	/* deassert wr signal */
 	var = data << PCIE_PHY_CTRL_DATA_LOC;
-	writel(var, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, var);
 
 	/* wait for ack de-assertion */
-	ret = pcie_phy_poll_ack(dbi_base, 0);
+	ret = pcie_phy_poll_ack(imx6_pcie, 0);
 	if (ret)
 		return ret;
 
-	writel(0x0, dbi_base + PCIE_PHY_CTRL);
+	dw_pcie_writel_rc(pp, PCIE_PHY_CTRL, 0x0);
 
 	return 0;
 }
 
-static void imx6_pcie_reset_phy(struct pcie_port *pp)
+static void imx6_pcie_reset_phy(struct imx6_pcie *imx6_pcie)
 {
 	u32 tmp;
 
-	pcie_phy_read(pp->dbi_base, PHY_RX_OVRD_IN_LO, &tmp);
+	pcie_phy_read(imx6_pcie, PHY_RX_OVRD_IN_LO, &tmp);
 	tmp |= (PHY_RX_OVRD_IN_LO_RX_DATA_EN |
 		PHY_RX_OVRD_IN_LO_RX_PLL_EN);
-	pcie_phy_write(pp->dbi_base, PHY_RX_OVRD_IN_LO, tmp);
+	pcie_phy_write(imx6_pcie, PHY_RX_OVRD_IN_LO, tmp);
 
 	usleep_range(2000, 3000);
 
-	pcie_phy_read(pp->dbi_base, PHY_RX_OVRD_IN_LO, &tmp);
+	pcie_phy_read(imx6_pcie, PHY_RX_OVRD_IN_LO, &tmp);
 	tmp &= ~(PHY_RX_OVRD_IN_LO_RX_DATA_EN |
 		  PHY_RX_OVRD_IN_LO_RX_PLL_EN);
-	pcie_phy_write(pp->dbi_base, PHY_RX_OVRD_IN_LO, tmp);
+	pcie_phy_write(imx6_pcie, PHY_RX_OVRD_IN_LO, tmp);
 }
 
 /*  Added for PCI abort handling */
@@ -242,9 +245,9 @@ static int imx6q_pcie_abort_handler(unsigned long addr,
 	return 0;
 }
 
-static int imx6_pcie_assert_core_reset(struct pcie_port *pp)
+static void imx6_pcie_assert_core_reset(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
+	struct pcie_port *pp = &imx6_pcie->pp;
 	u32 val, gpr1, gpr12;
 
 	switch (imx6_pcie->variant) {
@@ -281,10 +284,10 @@ static int imx6_pcie_assert_core_reset(struct pcie_port *pp)
 
 		if ((gpr1 & IMX6Q_GPR1_PCIE_REF_CLK_EN) &&
 		    (gpr12 & IMX6Q_GPR12_PCIE_CTL_2)) {
-			val = readl(pp->dbi_base + PCIE_PL_PFLR);
+			val = dw_pcie_readl_rc(pp, PCIE_PL_PFLR);
 			val &= ~PCIE_PL_PFLR_LINK_STATE_MASK;
 			val |= PCIE_PL_PFLR_FORCE_LINK;
-			writel(val, pp->dbi_base + PCIE_PL_PFLR);
+			dw_pcie_writel_rc(pp, PCIE_PL_PFLR, val);
 
 			regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12,
 					   IMX6Q_GPR12_PCIE_CTL_2, 0 << 10);
@@ -296,20 +299,19 @@ static int imx6_pcie_assert_core_reset(struct pcie_port *pp)
 				   IMX6Q_GPR1_PCIE_REF_CLK_EN, 0 << 16);
 		break;
 	}
-
-	return 0;
 }
 
 static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie)
 {
 	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret = 0;
 
 	switch (imx6_pcie->variant) {
 	case IMX6SX:
 		ret = clk_prepare_enable(imx6_pcie->pcie_inbound_axi);
 		if (ret) {
-			dev_err(pp->dev, "unable to enable pcie_axi clock\n");
+			dev_err(dev, "unable to enable pcie_axi clock\n");
 			break;
 		}
 
@@ -336,32 +338,33 @@ static int imx6_pcie_enable_ref_clk(struct imx6_pcie *imx6_pcie)
 	return ret;
 }
 
-static int imx6_pcie_deassert_core_reset(struct pcie_port *pp)
+static void imx6_pcie_deassert_core_reset(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	ret = clk_prepare_enable(imx6_pcie->pcie_phy);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie_phy clock\n");
-		goto err_pcie_phy;
+		dev_err(dev, "unable to enable pcie_phy clock\n");
+		return;
 	}
 
 	ret = clk_prepare_enable(imx6_pcie->pcie_bus);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie_bus clock\n");
+		dev_err(dev, "unable to enable pcie_bus clock\n");
 		goto err_pcie_bus;
 	}
 
 	ret = clk_prepare_enable(imx6_pcie->pcie);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie clock\n");
+		dev_err(dev, "unable to enable pcie clock\n");
 		goto err_pcie;
 	}
 
 	ret = imx6_pcie_enable_ref_clk(imx6_pcie);
 	if (ret) {
-		dev_err(pp->dev, "unable to enable pcie ref clock\n");
+		dev_err(dev, "unable to enable pcie ref clock\n");
 		goto err_ref_clk;
 	}
 
@@ -392,7 +395,7 @@ static int imx6_pcie_deassert_core_reset(struct pcie_port *pp)
 		break;
 	}
 
-	return 0;
+	return;
 
 err_ref_clk:
 	clk_disable_unprepare(imx6_pcie->pcie);
@@ -400,14 +403,10 @@ err_pcie:
 	clk_disable_unprepare(imx6_pcie->pcie_bus);
 err_pcie_bus:
 	clk_disable_unprepare(imx6_pcie->pcie_phy);
-err_pcie_phy:
-	return ret;
 }
 
-static void imx6_pcie_init_phy(struct pcie_port *pp)
+static void imx6_pcie_init_phy(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
-
 	if (imx6_pcie->variant == IMX6SX)
 		regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12,
 				   IMX6SX_GPR12_PCIE_RX_EQ_MASK,
@@ -439,45 +438,52 @@ static void imx6_pcie_init_phy(struct pcie_port *pp)
 			   imx6_pcie->tx_swing_low << 25);
 }
 
-static int imx6_pcie_wait_for_link(struct pcie_port *pp)
+static int imx6_pcie_wait_for_link(struct imx6_pcie *imx6_pcie)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
+
 	/* check if the link is up or not */
 	if (!dw_pcie_wait_for_link(pp))
 		return 0;
 
-	dev_dbg(pp->dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
+	dev_dbg(dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R0),
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1));
 	return -ETIMEDOUT;
 }
 
-static int imx6_pcie_wait_for_speed_change(struct pcie_port *pp)
+static int imx6_pcie_wait_for_speed_change(struct imx6_pcie *imx6_pcie)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 tmp;
 	unsigned int retries;
 
 	for (retries = 0; retries < 200; retries++) {
-		tmp = readl(pp->dbi_base + PCIE_LINK_WIDTH_SPEED_CONTROL);
+		tmp = dw_pcie_readl_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL);
 		/* Test if the speed change finished. */
 		if (!(tmp & PORT_LOGIC_SPEED_CHANGE))
 			return 0;
 		usleep_range(100, 1000);
 	}
 
-	dev_err(pp->dev, "Speed change timeout\n");
+	dev_err(dev, "Speed change timeout\n");
 	return -EINVAL;
 }
 
 static irqreturn_t imx6_pcie_msi_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct imx6_pcie *imx6_pcie = arg;
+	struct pcie_port *pp = &imx6_pcie->pp;
 
 	return dw_handle_msi_irq(pp);
 }
 
-static int imx6_pcie_establish_link(struct pcie_port *pp)
+static int imx6_pcie_establish_link(struct imx6_pcie *imx6_pcie)
 {
-	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 tmp;
 	int ret;
 
@@ -486,76 +492,73 @@ static int imx6_pcie_establish_link(struct pcie_port *pp)
 	 * started in Gen2 mode, there is a possibility the devices on the
 	 * bus will not be detected at all.  This happens with PCIe switches.
 	 */
-	tmp = readl(pp->dbi_base + PCIE_RC_LCR);
+	tmp = dw_pcie_readl_rc(pp, PCIE_RC_LCR);
 	tmp &= ~PCIE_RC_LCR_MAX_LINK_SPEEDS_MASK;
 	tmp |= PCIE_RC_LCR_MAX_LINK_SPEEDS_GEN1;
-	writel(tmp, pp->dbi_base + PCIE_RC_LCR);
+	dw_pcie_writel_rc(pp, PCIE_RC_LCR, tmp);
 
 	/* Start LTSSM. */
 	regmap_update_bits(imx6_pcie->iomuxc_gpr, IOMUXC_GPR12,
 			IMX6Q_GPR12_PCIE_CTL_2, 1 << 10);
 
-	ret = imx6_pcie_wait_for_link(pp);
+	ret = imx6_pcie_wait_for_link(imx6_pcie);
 	if (ret) {
-		dev_info(pp->dev, "Link never came up\n");
+		dev_info(dev, "Link never came up\n");
 		goto err_reset_phy;
 	}
 
 	if (imx6_pcie->link_gen == 2) {
 		/* Allow Gen2 mode after the link is up. */
-		tmp = readl(pp->dbi_base + PCIE_RC_LCR);
+		tmp = dw_pcie_readl_rc(pp, PCIE_RC_LCR);
 		tmp &= ~PCIE_RC_LCR_MAX_LINK_SPEEDS_MASK;
 		tmp |= PCIE_RC_LCR_MAX_LINK_SPEEDS_GEN2;
-		writel(tmp, pp->dbi_base + PCIE_RC_LCR);
+		dw_pcie_writel_rc(pp, PCIE_RC_LCR, tmp);
 	} else {
-		dev_info(pp->dev, "Link: Gen2 disabled\n");
+		dev_info(dev, "Link: Gen2 disabled\n");
 	}
 
 	/*
 	 * Start Directed Speed Change so the best possible speed both link
 	 * partners support can be negotiated.
 	 */
-	tmp = readl(pp->dbi_base + PCIE_LINK_WIDTH_SPEED_CONTROL);
+	tmp = dw_pcie_readl_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL);
 	tmp |= PORT_LOGIC_SPEED_CHANGE;
-	writel(tmp, pp->dbi_base + PCIE_LINK_WIDTH_SPEED_CONTROL);
+	dw_pcie_writel_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, tmp);
 
-	ret = imx6_pcie_wait_for_speed_change(pp);
+	ret = imx6_pcie_wait_for_speed_change(imx6_pcie);
 	if (ret) {
-		dev_err(pp->dev, "Failed to bring link up!\n");
+		dev_err(dev, "Failed to bring link up!\n");
 		goto err_reset_phy;
 	}
 
 	/* Make sure link training is finished as well! */
-	ret = imx6_pcie_wait_for_link(pp);
+	ret = imx6_pcie_wait_for_link(imx6_pcie);
 	if (ret) {
-		dev_err(pp->dev, "Failed to bring link up!\n");
+		dev_err(dev, "Failed to bring link up!\n");
 		goto err_reset_phy;
 	}
 
-	tmp = readl(pp->dbi_base + PCIE_RC_LCSR);
-	dev_info(pp->dev, "Link up, Gen%i\n", (tmp >> 16) & 0xf);
+	tmp = dw_pcie_readl_rc(pp, PCIE_RC_LCSR);
+	dev_info(dev, "Link up, Gen%i\n", (tmp >> 16) & 0xf);
 	return 0;
 
 err_reset_phy:
-	dev_dbg(pp->dev, "PHY DEBUG_R0=0x%08x DEBUG_R1=0x%08x\n",
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
-	imx6_pcie_reset_phy(pp);
-
+	dev_dbg(dev, "PHY DEBUG_R0=0x%08x DEBUG_R1=0x%08x\n",
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R0),
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1));
+	imx6_pcie_reset_phy(imx6_pcie);
 	return ret;
 }
 
 static void imx6_pcie_host_init(struct pcie_port *pp)
 {
-	imx6_pcie_assert_core_reset(pp);
-
-	imx6_pcie_init_phy(pp);
-
-	imx6_pcie_deassert_core_reset(pp);
+	struct imx6_pcie *imx6_pcie = to_imx6_pcie(pp);
 
+	imx6_pcie_assert_core_reset(imx6_pcie);
+	imx6_pcie_init_phy(imx6_pcie);
+	imx6_pcie_deassert_core_reset(imx6_pcie);
 	dw_pcie_setup_rc(pp);
-
-	imx6_pcie_establish_link(pp);
+	imx6_pcie_establish_link(imx6_pcie);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		dw_pcie_msi_init(pp);
@@ -563,7 +566,7 @@ static void imx6_pcie_host_init(struct pcie_port *pp)
 
 static int imx6_pcie_link_up(struct pcie_port *pp)
 {
-	return readl(pp->dbi_base + PCIE_PHY_DEBUG_R1) &
+	return dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1) &
 			PCIE_PHY_DEBUG_R1_XMLH_LINK_UP;
 }
 
@@ -572,24 +575,26 @@ static struct pcie_host_ops imx6_pcie_host_ops = {
 	.host_init = imx6_pcie_host_init,
 };
 
-static int __init imx6_add_pcie_port(struct pcie_port *pp,
-			struct platform_device *pdev)
+static int __init imx6_add_pcie_port(struct imx6_pcie *imx6_pcie,
+				     struct platform_device *pdev)
 {
+	struct pcie_port *pp = &imx6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		pp->msi_irq = platform_get_irq_byname(pdev, "msi");
 		if (pp->msi_irq <= 0) {
-			dev_err(&pdev->dev, "failed to get MSI irq\n");
+			dev_err(dev, "failed to get MSI irq\n");
 			return -ENODEV;
 		}
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 				       imx6_pcie_msi_handler,
 				       IRQF_SHARED | IRQF_NO_THREAD,
-				       "mx6-pcie-msi", pp);
+				       "mx6-pcie-msi", imx6_pcie);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request MSI irq\n");
+			dev_err(dev, "failed to request MSI irq\n");
 			return ret;
 		}
 	}
@@ -599,7 +604,7 @@ static int __init imx6_add_pcie_port(struct pcie_port *pp,
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -608,75 +613,72 @@ static int __init imx6_add_pcie_port(struct pcie_port *pp,
 
 static int __init imx6_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct imx6_pcie *imx6_pcie;
 	struct pcie_port *pp;
-	struct device_node *np = pdev->dev.of_node;
 	struct resource *dbi_base;
-	struct device_node *node = pdev->dev.of_node;
+	struct device_node *node = dev->of_node;
 	int ret;
 
-	imx6_pcie = devm_kzalloc(&pdev->dev, sizeof(*imx6_pcie), GFP_KERNEL);
+	imx6_pcie = devm_kzalloc(dev, sizeof(*imx6_pcie), GFP_KERNEL);
 	if (!imx6_pcie)
 		return -ENOMEM;
 
 	pp = &imx6_pcie->pp;
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	imx6_pcie->variant =
-		(enum imx6_pcie_variants)of_device_get_match_data(&pdev->dev);
+		(enum imx6_pcie_variants)of_device_get_match_data(dev);
 
 	/* Added for PCI abort handling */
 	hook_fault_code(16 + 6, imx6q_pcie_abort_handler, SIGBUS, 0,
 		"imprecise external abort");
 
 	dbi_base = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	pp->dbi_base = devm_ioremap_resource(&pdev->dev, dbi_base);
+	pp->dbi_base = devm_ioremap_resource(dev, dbi_base);
 	if (IS_ERR(pp->dbi_base))
 		return PTR_ERR(pp->dbi_base);
 
 	/* Fetch GPIOs */
-	imx6_pcie->reset_gpio = of_get_named_gpio(np, "reset-gpio", 0);
-	imx6_pcie->gpio_active_high = of_property_read_bool(np,
+	imx6_pcie->reset_gpio = of_get_named_gpio(node, "reset-gpio", 0);
+	imx6_pcie->gpio_active_high = of_property_read_bool(node,
 						"reset-gpio-active-high");
 	if (gpio_is_valid(imx6_pcie->reset_gpio)) {
-		ret = devm_gpio_request_one(&pdev->dev, imx6_pcie->reset_gpio,
+		ret = devm_gpio_request_one(dev, imx6_pcie->reset_gpio,
 				imx6_pcie->gpio_active_high ?
 					GPIOF_OUT_INIT_HIGH :
 					GPIOF_OUT_INIT_LOW,
 				"PCIe reset");
 		if (ret) {
-			dev_err(&pdev->dev, "unable to get reset gpio\n");
+			dev_err(dev, "unable to get reset gpio\n");
 			return ret;
 		}
 	}
 
 	/* Fetch clocks */
-	imx6_pcie->pcie_phy = devm_clk_get(&pdev->dev, "pcie_phy");
+	imx6_pcie->pcie_phy = devm_clk_get(dev, "pcie_phy");
 	if (IS_ERR(imx6_pcie->pcie_phy)) {
-		dev_err(&pdev->dev,
-			"pcie_phy clock source missing or invalid\n");
+		dev_err(dev, "pcie_phy clock source missing or invalid\n");
 		return PTR_ERR(imx6_pcie->pcie_phy);
 	}
 
-	imx6_pcie->pcie_bus = devm_clk_get(&pdev->dev, "pcie_bus");
+	imx6_pcie->pcie_bus = devm_clk_get(dev, "pcie_bus");
 	if (IS_ERR(imx6_pcie->pcie_bus)) {
-		dev_err(&pdev->dev,
-			"pcie_bus clock source missing or invalid\n");
+		dev_err(dev, "pcie_bus clock source missing or invalid\n");
 		return PTR_ERR(imx6_pcie->pcie_bus);
 	}
 
-	imx6_pcie->pcie = devm_clk_get(&pdev->dev, "pcie");
+	imx6_pcie->pcie = devm_clk_get(dev, "pcie");
 	if (IS_ERR(imx6_pcie->pcie)) {
-		dev_err(&pdev->dev,
-			"pcie clock source missing or invalid\n");
+		dev_err(dev, "pcie clock source missing or invalid\n");
 		return PTR_ERR(imx6_pcie->pcie);
 	}
 
 	if (imx6_pcie->variant == IMX6SX) {
-		imx6_pcie->pcie_inbound_axi = devm_clk_get(&pdev->dev,
+		imx6_pcie->pcie_inbound_axi = devm_clk_get(dev,
 							   "pcie_inbound_axi");
 		if (IS_ERR(imx6_pcie->pcie_inbound_axi)) {
-			dev_err(&pdev->dev,
+			dev_err(dev,
 				"pcie_incbound_axi clock missing or invalid\n");
 			return PTR_ERR(imx6_pcie->pcie_inbound_axi);
 		}
@@ -686,7 +688,7 @@ static int __init imx6_pcie_probe(struct platform_device *pdev)
 	imx6_pcie->iomuxc_gpr =
 		 syscon_regmap_lookup_by_compatible("fsl,imx6q-iomuxc-gpr");
 	if (IS_ERR(imx6_pcie->iomuxc_gpr)) {
-		dev_err(&pdev->dev, "unable to find iomuxc registers\n");
+		dev_err(dev, "unable to find iomuxc registers\n");
 		return PTR_ERR(imx6_pcie->iomuxc_gpr);
 	}
 
@@ -712,12 +714,12 @@ static int __init imx6_pcie_probe(struct platform_device *pdev)
 		imx6_pcie->tx_swing_low = 127;
 
 	/* Limit link speed */
-	ret = of_property_read_u32(pp->dev->of_node, "fsl,max-link-speed",
+	ret = of_property_read_u32(node, "fsl,max-link-speed",
 				   &imx6_pcie->link_gen);
 	if (ret)
 		imx6_pcie->link_gen = 1;
 
-	ret = imx6_add_pcie_port(pp, pdev);
+	ret = imx6_add_pcie_port(imx6_pcie, pdev);
 	if (ret < 0)
 		return ret;
 
@@ -730,7 +732,7 @@ static void imx6_pcie_shutdown(struct platform_device *pdev)
 	struct imx6_pcie *imx6_pcie = platform_get_drvdata(pdev);
 
 	/* bring down link, so bootloader gets clean state in case of reboot */
-	imx6_pcie_assert_core_reset(&imx6_pcie->pp);
+	imx6_pcie_assert_core_reset(imx6_pcie);
 }
 
 static const struct of_device_id imx6_pcie_of_match[] = {
diff --git a/drivers/pci/host/pci-keystone-dw.c b/drivers/pci/host/pci-keystone-dw.c
index 41515092eb0d..9397c4667106 100644
--- a/drivers/pci/host/pci-keystone-dw.c
+++ b/drivers/pci/host/pci-keystone-dw.c
@@ -88,13 +88,24 @@ phys_addr_t ks_dw_pcie_get_msi_addr(struct pcie_port *pp)
 	return ks_pcie->app.start + MSI_IRQ;
 }
 
+static u32 ks_dw_app_readl(struct keystone_pcie *ks_pcie, u32 offset)
+{
+	return readl(ks_pcie->va_app_base + offset);
+}
+
+static void ks_dw_app_writel(struct keystone_pcie *ks_pcie, u32 offset, u32 val)
+{
+	writel(val, ks_pcie->va_app_base + offset);
+}
+
 void ks_dw_pcie_handle_msi_irq(struct keystone_pcie *ks_pcie, int offset)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 pending, vector;
 	int src, virq;
 
-	pending = readl(ks_pcie->va_app_base + MSI0_IRQ_STATUS + (offset << 4));
+	pending = ks_dw_app_readl(ks_pcie, MSI0_IRQ_STATUS + (offset << 4));
 
 	/*
 	 * MSI0 status bit 0-3 shows vectors 0, 8, 16, 24, MSI1 status bit
@@ -104,7 +115,7 @@ void ks_dw_pcie_handle_msi_irq(struct keystone_pcie *ks_pcie, int offset)
 		if (BIT(src) & pending) {
 			vector = offset + (src << 3);
 			virq = irq_linear_revmap(pp->irq_domain, vector);
-			dev_dbg(pp->dev, "irq: bit %d, vector %d, virq %d\n",
+			dev_dbg(dev, "irq: bit %d, vector %d, virq %d\n",
 				src, vector, virq);
 			generic_handle_irq(virq);
 		}
@@ -124,9 +135,9 @@ static void ks_dw_pcie_msi_irq_ack(struct irq_data *d)
 	offset = d->irq - irq_linear_revmap(pp->irq_domain, 0);
 	update_reg_offset_bit_pos(offset, &reg_offset, &bit_pos);
 
-	writel(BIT(bit_pos),
-	       ks_pcie->va_app_base + MSI0_IRQ_STATUS + (reg_offset << 4));
-	writel(reg_offset + MSI_IRQ_OFFSET, ks_pcie->va_app_base + IRQ_EOI);
+	ks_dw_app_writel(ks_pcie, MSI0_IRQ_STATUS + (reg_offset << 4),
+			 BIT(bit_pos));
+	ks_dw_app_writel(ks_pcie, IRQ_EOI, reg_offset + MSI_IRQ_OFFSET);
 }
 
 void ks_dw_pcie_msi_set_irq(struct pcie_port *pp, int irq)
@@ -135,8 +146,8 @@ void ks_dw_pcie_msi_set_irq(struct pcie_port *pp, int irq)
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
 
 	update_reg_offset_bit_pos(irq, &reg_offset, &bit_pos);
-	writel(BIT(bit_pos),
-	       ks_pcie->va_app_base + MSI0_IRQ_ENABLE_SET + (reg_offset << 4));
+	ks_dw_app_writel(ks_pcie, MSI0_IRQ_ENABLE_SET + (reg_offset << 4),
+			 BIT(bit_pos));
 }
 
 void ks_dw_pcie_msi_clear_irq(struct pcie_port *pp, int irq)
@@ -145,8 +156,8 @@ void ks_dw_pcie_msi_clear_irq(struct pcie_port *pp, int irq)
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
 
 	update_reg_offset_bit_pos(irq, &reg_offset, &bit_pos);
-	writel(BIT(bit_pos),
-	       ks_pcie->va_app_base + MSI0_IRQ_ENABLE_CLR + (reg_offset << 4));
+	ks_dw_app_writel(ks_pcie, MSI0_IRQ_ENABLE_CLR + (reg_offset << 4),
+			 BIT(bit_pos));
 }
 
 static void ks_dw_pcie_msi_irq_mask(struct irq_data *d)
@@ -215,6 +226,7 @@ static const struct irq_domain_ops ks_dw_pcie_msi_domain_ops = {
 int ks_dw_pcie_msi_host_init(struct pcie_port *pp, struct msi_controller *chip)
 {
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
+	struct device *dev = pp->dev;
 	int i;
 
 	pp->irq_domain = irq_domain_add_linear(ks_pcie->msi_intc_np,
@@ -222,7 +234,7 @@ int ks_dw_pcie_msi_host_init(struct pcie_port *pp, struct msi_controller *chip)
 					&ks_dw_pcie_msi_domain_ops,
 					chip);
 	if (!pp->irq_domain) {
-		dev_err(pp->dev, "irq domain init failed\n");
+		dev_err(dev, "irq domain init failed\n");
 		return -ENXIO;
 	}
 
@@ -237,47 +249,47 @@ void ks_dw_pcie_enable_legacy_irqs(struct keystone_pcie *ks_pcie)
 	int i;
 
 	for (i = 0; i < MAX_LEGACY_IRQS; i++)
-		writel(0x1, ks_pcie->va_app_base + IRQ_ENABLE_SET + (i << 4));
+		ks_dw_app_writel(ks_pcie, IRQ_ENABLE_SET + (i << 4), 0x1);
 }
 
 void ks_dw_pcie_handle_legacy_irq(struct keystone_pcie *ks_pcie, int offset)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 pending;
 	int virq;
 
-	pending = readl(ks_pcie->va_app_base + IRQ_STATUS + (offset << 4));
+	pending = ks_dw_app_readl(ks_pcie, IRQ_STATUS + (offset << 4));
 
 	if (BIT(0) & pending) {
 		virq = irq_linear_revmap(ks_pcie->legacy_irq_domain, offset);
-		dev_dbg(pp->dev, ": irq: irq_offset %d, virq %d\n", offset,
-			virq);
+		dev_dbg(dev, ": irq: irq_offset %d, virq %d\n", offset, virq);
 		generic_handle_irq(virq);
 	}
 
 	/* EOI the INTx interrupt */
-	writel(offset, ks_pcie->va_app_base + IRQ_EOI);
+	ks_dw_app_writel(ks_pcie, IRQ_EOI, offset);
 }
 
-void ks_dw_pcie_enable_error_irq(void __iomem *reg_base)
+void ks_dw_pcie_enable_error_irq(struct keystone_pcie *ks_pcie)
 {
-	writel(ERR_IRQ_ALL, reg_base + ERR_IRQ_ENABLE_SET);
+	ks_dw_app_writel(ks_pcie, ERR_IRQ_ENABLE_SET, ERR_IRQ_ALL);
 }
 
-irqreturn_t ks_dw_pcie_handle_error_irq(struct device *dev,
-					void __iomem *reg_base)
+irqreturn_t ks_dw_pcie_handle_error_irq(struct keystone_pcie *ks_pcie)
 {
 	u32 status;
 
-	status = readl(reg_base + ERR_IRQ_STATUS_RAW) & ERR_IRQ_ALL;
+	status = ks_dw_app_readl(ks_pcie, ERR_IRQ_STATUS_RAW) & ERR_IRQ_ALL;
 	if (!status)
 		return IRQ_NONE;
 
 	if (status & ERR_FATAL_IRQ)
-		dev_err(dev, "fatal error (status %#010x)\n", status);
+		dev_err(ks_pcie->pp.dev, "fatal error (status %#010x)\n",
+			status);
 
 	/* Ack the IRQ; status bits are RW1C */
-	writel(status, reg_base + ERR_IRQ_STATUS);
+	ks_dw_app_writel(ks_pcie, ERR_IRQ_STATUS, status);
 	return IRQ_HANDLED;
 }
 
@@ -322,15 +334,15 @@ static const struct irq_domain_ops ks_dw_pcie_legacy_irq_domain_ops = {
  * Since modification of dbi_cs2 involves different clock domain, read the
  * status back to ensure the transition is complete.
  */
-static void ks_dw_pcie_set_dbi_mode(void __iomem *reg_virt)
+static void ks_dw_pcie_set_dbi_mode(struct keystone_pcie *ks_pcie)
 {
 	u32 val;
 
-	writel(DBI_CS2_EN_VAL | readl(reg_virt + CMD_STATUS),
-	       reg_virt + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, DBI_CS2_EN_VAL | val);
 
 	do {
-		val = readl(reg_virt + CMD_STATUS);
+		val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
 	} while (!(val & DBI_CS2_EN_VAL));
 }
 
@@ -340,15 +352,15 @@ static void ks_dw_pcie_set_dbi_mode(void __iomem *reg_virt)
  * Since modification of dbi_cs2 involves different clock domain, read the
  * status back to ensure the transition is complete.
  */
-static void ks_dw_pcie_clear_dbi_mode(void __iomem *reg_virt)
+static void ks_dw_pcie_clear_dbi_mode(struct keystone_pcie *ks_pcie)
 {
 	u32 val;
 
-	writel(~DBI_CS2_EN_VAL & readl(reg_virt + CMD_STATUS),
-		     reg_virt + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, ~DBI_CS2_EN_VAL & val);
 
 	do {
-		val = readl(reg_virt + CMD_STATUS);
+		val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
 	} while (val & DBI_CS2_EN_VAL);
 }
 
@@ -357,28 +369,29 @@ void ks_dw_pcie_setup_rc_app_regs(struct keystone_pcie *ks_pcie)
 	struct pcie_port *pp = &ks_pcie->pp;
 	u32 start = pp->mem->start, end = pp->mem->end;
 	int i, tr_size;
+	u32 val;
 
 	/* Disable BARs for inbound access */
-	ks_dw_pcie_set_dbi_mode(ks_pcie->va_app_base);
-	writel(0, pp->dbi_base + PCI_BASE_ADDRESS_0);
-	writel(0, pp->dbi_base + PCI_BASE_ADDRESS_1);
-	ks_dw_pcie_clear_dbi_mode(ks_pcie->va_app_base);
+	ks_dw_pcie_set_dbi_mode(ks_pcie);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, 0);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_1, 0);
+	ks_dw_pcie_clear_dbi_mode(ks_pcie);
 
 	/* Set outbound translation size per window division */
-	writel(CFG_PCIM_WIN_SZ_IDX & 0x7, ks_pcie->va_app_base + OB_SIZE);
+	ks_dw_app_writel(ks_pcie, OB_SIZE, CFG_PCIM_WIN_SZ_IDX & 0x7);
 
 	tr_size = (1 << (CFG_PCIM_WIN_SZ_IDX & 0x7)) * SZ_1M;
 
 	/* Using Direct 1:1 mapping of RC <-> PCI memory space */
 	for (i = 0; (i < CFG_PCIM_WIN_CNT) && (start < end); i++) {
-		writel(start | 1, ks_pcie->va_app_base + OB_OFFSET_INDEX(i));
-		writel(0, ks_pcie->va_app_base + OB_OFFSET_HI(i));
+		ks_dw_app_writel(ks_pcie, OB_OFFSET_INDEX(i), start | 1);
+		ks_dw_app_writel(ks_pcie, OB_OFFSET_HI(i), 0);
 		start += tr_size;
 	}
 
 	/* Enable OB translation */
-	writel(OB_XLAT_EN_VAL | readl(ks_pcie->va_app_base + CMD_STATUS),
-	       ks_pcie->va_app_base + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, OB_XLAT_EN_VAL | val);
 }
 
 /**
@@ -418,7 +431,7 @@ static void __iomem *ks_pcie_cfg_setup(struct keystone_pcie *ks_pcie, u8 bus,
 	if (bus != 1)
 		regval |= BIT(24);
 
-	writel(regval, ks_pcie->va_app_base + CFG_SETUP);
+	ks_dw_app_writel(ks_pcie, CFG_SETUP, regval);
 	return pp->va_cfg0_base;
 }
 
@@ -456,19 +469,19 @@ void ks_dw_pcie_v3_65_scan_bus(struct pcie_port *pp)
 	struct keystone_pcie *ks_pcie = to_keystone_pcie(pp);
 
 	/* Configure and set up BAR0 */
-	ks_dw_pcie_set_dbi_mode(ks_pcie->va_app_base);
+	ks_dw_pcie_set_dbi_mode(ks_pcie);
 
 	/* Enable BAR0 */
-	writel(1, pp->dbi_base + PCI_BASE_ADDRESS_0);
-	writel(SZ_4K - 1, pp->dbi_base + PCI_BASE_ADDRESS_0);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, 1);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, SZ_4K - 1);
 
-	ks_dw_pcie_clear_dbi_mode(ks_pcie->va_app_base);
+	ks_dw_pcie_clear_dbi_mode(ks_pcie);
 
 	 /*
 	  * For BAR0, just setting bus address for inbound writes (MSI) should
 	  * be sufficient.  Use physical address to avoid any conflicts.
 	  */
-	writel(ks_pcie->app.start, pp->dbi_base + PCI_BASE_ADDRESS_0);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, ks_pcie->app.start);
 }
 
 /**
@@ -476,8 +489,9 @@ void ks_dw_pcie_v3_65_scan_bus(struct pcie_port *pp)
  */
 int ks_dw_pcie_link_up(struct pcie_port *pp)
 {
-	u32 val = readl(pp->dbi_base + DEBUG0);
+	u32 val;
 
+	val = dw_pcie_readl_rc(pp, DEBUG0);
 	return (val & LTSSM_STATE_MASK) == LTSSM_STATE_L0;
 }
 
@@ -486,13 +500,13 @@ void ks_dw_pcie_initiate_link_train(struct keystone_pcie *ks_pcie)
 	u32 val;
 
 	/* Disable Link training */
-	val = readl(ks_pcie->va_app_base + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
 	val &= ~LTSSM_EN_VAL;
-	writel(LTSSM_EN_VAL | val,  ks_pcie->va_app_base + CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, LTSSM_EN_VAL | val);
 
 	/* Initiate Link Training */
-	val = readl(ks_pcie->va_app_base + CMD_STATUS);
-	writel(LTSSM_EN_VAL | val,  ks_pcie->va_app_base + CMD_STATUS);
+	val = ks_dw_app_readl(ks_pcie, CMD_STATUS);
+	ks_dw_app_writel(ks_pcie, CMD_STATUS, LTSSM_EN_VAL | val);
 }
 
 /**
@@ -506,12 +520,13 @@ int __init ks_dw_pcie_host_init(struct keystone_pcie *ks_pcie,
 				struct device_node *msi_intc_np)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
-	struct platform_device *pdev = to_platform_device(pp->dev);
+	struct device *dev = pp->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	struct resource *res;
 
 	/* Index 0 is the config reg. space address */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	pp->dbi_base = devm_ioremap_resource(pp->dev, res);
+	pp->dbi_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pp->dbi_base))
 		return PTR_ERR(pp->dbi_base);
 
@@ -524,7 +539,7 @@ int __init ks_dw_pcie_host_init(struct keystone_pcie *ks_pcie,
 
 	/* Index 1 is the application reg. space address */
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 1);
-	ks_pcie->va_app_base = devm_ioremap_resource(pp->dev, res);
+	ks_pcie->va_app_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(ks_pcie->va_app_base))
 		return PTR_ERR(ks_pcie->va_app_base);
 
@@ -537,7 +552,7 @@ int __init ks_dw_pcie_host_init(struct keystone_pcie *ks_pcie,
 					&ks_dw_pcie_legacy_irq_domain_ops,
 					NULL);
 	if (!ks_pcie->legacy_irq_domain) {
-		dev_err(pp->dev, "Failed to add irq domain for legacy irqs\n");
+		dev_err(dev, "Failed to add irq domain for legacy irqs\n");
 		return -EINVAL;
 	}
 
diff --git a/drivers/pci/host/pci-keystone.c b/drivers/pci/host/pci-keystone.c
index 82b461b5b08a..043c19a05da1 100644
--- a/drivers/pci/host/pci-keystone.c
+++ b/drivers/pci/host/pci-keystone.c
@@ -89,12 +89,13 @@ DECLARE_PCI_FIXUP_ENABLE(PCI_ANY_ID, PCI_ANY_ID, quirk_limit_mrrs);
 static int ks_pcie_establish_link(struct keystone_pcie *ks_pcie)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	unsigned int retries;
 
 	dw_pcie_setup_rc(pp);
 
 	if (dw_pcie_link_up(pp)) {
-		dev_err(pp->dev, "Link already up\n");
+		dev_err(dev, "Link already up\n");
 		return 0;
 	}
 
@@ -105,7 +106,7 @@ static int ks_pcie_establish_link(struct keystone_pcie *ks_pcie)
 			return 0;
 	}
 
-	dev_err(pp->dev, "phy link never came up\n");
+	dev_err(dev, "phy link never came up\n");
 	return -ETIMEDOUT;
 }
 
@@ -115,9 +116,10 @@ static void ks_pcie_msi_irq_handler(struct irq_desc *desc)
 	struct keystone_pcie *ks_pcie = irq_desc_get_handler_data(desc);
 	u32 offset = irq - ks_pcie->msi_host_irqs[0];
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
-	dev_dbg(pp->dev, "%s, irq %d\n", __func__, irq);
+	dev_dbg(dev, "%s, irq %d\n", __func__, irq);
 
 	/*
 	 * The chained irq handler installation would have replaced normal
@@ -142,10 +144,11 @@ static void ks_pcie_legacy_irq_handler(struct irq_desc *desc)
 	unsigned int irq = irq_desc_get_irq(desc);
 	struct keystone_pcie *ks_pcie = irq_desc_get_handler_data(desc);
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	u32 irq_offset = irq - ks_pcie->legacy_host_irqs[0];
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 
-	dev_dbg(pp->dev, ": Handling legacy irq %d\n", irq);
+	dev_dbg(dev, ": Handling legacy irq %d\n", irq);
 
 	/*
 	 * The chained irq handler installation would have replaced normal
@@ -234,7 +237,7 @@ static void ks_pcie_setup_interrupts(struct keystone_pcie *ks_pcie)
 	}
 
 	if (ks_pcie->error_irq > 0)
-		ks_dw_pcie_enable_error_irq(ks_pcie->va_app_base);
+		ks_dw_pcie_enable_error_irq(ks_pcie);
 }
 
 /*
@@ -302,14 +305,14 @@ static irqreturn_t pcie_err_irq_handler(int irq, void *priv)
 {
 	struct keystone_pcie *ks_pcie = priv;
 
-	return ks_dw_pcie_handle_error_irq(ks_pcie->pp.dev,
-					   ks_pcie->va_app_base);
+	return ks_dw_pcie_handle_error_irq(ks_pcie);
 }
 
 static int __init ks_add_pcie_port(struct keystone_pcie *ks_pcie,
 			 struct platform_device *pdev)
 {
 	struct pcie_port *pp = &ks_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	ret = ks_pcie_get_irq_controller_info(ks_pcie,
@@ -332,12 +335,12 @@ static int __init ks_add_pcie_port(struct keystone_pcie *ks_pcie,
 	 */
 	ks_pcie->error_irq = irq_of_parse_and_map(ks_pcie->np, 0);
 	if (ks_pcie->error_irq <= 0)
-		dev_info(&pdev->dev, "no error IRQ defined\n");
+		dev_info(dev, "no error IRQ defined\n");
 	else {
 		ret = request_irq(ks_pcie->error_irq, pcie_err_irq_handler,
 				  IRQF_SHARED, "pcie-error-irq", ks_pcie);
 		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to request error IRQ %d\n",
+			dev_err(dev, "failed to request error IRQ %d\n",
 				ks_pcie->error_irq);
 			return ret;
 		}
@@ -347,7 +350,7 @@ static int __init ks_add_pcie_port(struct keystone_pcie *ks_pcie,
 	pp->ops = &keystone_pcie_host_ops;
 	ret = ks_dw_pcie_host_init(ks_pcie, ks_pcie->msi_intc_np);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -381,12 +384,12 @@ static int __init ks_pcie_probe(struct platform_device *pdev)
 	struct phy *phy;
 	int ret;
 
-	ks_pcie = devm_kzalloc(&pdev->dev, sizeof(*ks_pcie),
-				GFP_KERNEL);
+	ks_pcie = devm_kzalloc(dev, sizeof(*ks_pcie), GFP_KERNEL);
 	if (!ks_pcie)
 		return -ENOMEM;
 
 	pp = &ks_pcie->pp;
+	pp->dev = dev;
 
 	/* initialize SerDes Phy if present */
 	phy = devm_phy_get(dev, "pcie-phy");
@@ -408,7 +411,6 @@ static int __init ks_pcie_probe(struct platform_device *pdev)
 	devm_iounmap(dev, reg_p);
 	devm_release_mem_region(dev, res->start, resource_size(res));
 
-	pp->dev = dev;
 	ks_pcie->np = dev->of_node;
 	platform_set_drvdata(pdev, ks_pcie);
 	ks_pcie->clk = devm_clk_get(dev, "pcie");
diff --git a/drivers/pci/host/pci-keystone.h b/drivers/pci/host/pci-keystone.h
index a5b0cb2ba4d7..bc54bafda068 100644
--- a/drivers/pci/host/pci-keystone.h
+++ b/drivers/pci/host/pci-keystone.h
@@ -17,8 +17,8 @@
 #define MAX_LEGACY_HOST_IRQS		4
 
 struct keystone_pcie {
+	struct	pcie_port	pp;		/* pp.dbi_base is DT 0th res */
 	struct	clk		*clk;
-	struct	pcie_port	pp;
 	/* PCI Device ID */
 	u32			device_id;
 	int			num_legacy_host_irqs;
@@ -34,7 +34,7 @@ struct keystone_pcie {
 	int error_irq;
 
 	/* Application register space */
-	void __iomem		*va_app_base;
+	void __iomem		*va_app_base;	/* DT 1st resource */
 	struct resource		app;
 };
 
@@ -45,9 +45,8 @@ phys_addr_t ks_dw_pcie_get_msi_addr(struct pcie_port *pp);
 /* Keystone specific PCI controller APIs */
 void ks_dw_pcie_enable_legacy_irqs(struct keystone_pcie *ks_pcie);
 void ks_dw_pcie_handle_legacy_irq(struct keystone_pcie *ks_pcie, int offset);
-void ks_dw_pcie_enable_error_irq(void __iomem *reg_base);
-irqreturn_t ks_dw_pcie_handle_error_irq(struct device *dev,
-					void __iomem *reg_base);
+void ks_dw_pcie_enable_error_irq(struct keystone_pcie *ks_pcie);
+irqreturn_t ks_dw_pcie_handle_error_irq(struct keystone_pcie *ks_pcie);
 int  ks_dw_pcie_host_init(struct keystone_pcie *ks_pcie,
 			struct device_node *msi_intc_np);
 int ks_dw_pcie_wr_other_conf(struct pcie_port *pp, struct pci_bus *bus,
diff --git a/drivers/pci/host/pci-layerscape.c b/drivers/pci/host/pci-layerscape.c
index 114ba819277a..2cb7315e26d0 100644
--- a/drivers/pci/host/pci-layerscape.c
+++ b/drivers/pci/host/pci-layerscape.c
@@ -45,10 +45,9 @@ struct ls_pcie_drvdata {
 };
 
 struct ls_pcie {
-	void __iomem *dbi;
+	struct pcie_port pp;		/* pp.dbi_base is DT regs */
 	void __iomem *lut;
 	struct regmap *scfg;
-	struct pcie_port pp;
 	const struct ls_pcie_drvdata *drvdata;
 	int index;
 };
@@ -59,7 +58,7 @@ static bool ls_pcie_is_bridge(struct ls_pcie *pcie)
 {
 	u32 header_type;
 
-	header_type = ioread8(pcie->dbi + PCI_HEADER_TYPE);
+	header_type = ioread8(pcie->pp.dbi_base + PCI_HEADER_TYPE);
 	header_type &= 0x7f;
 
 	return header_type == PCI_HEADER_TYPE_BRIDGE;
@@ -68,13 +67,13 @@ static bool ls_pcie_is_bridge(struct ls_pcie *pcie)
 /* Clear multi-function bit */
 static void ls_pcie_clear_multifunction(struct ls_pcie *pcie)
 {
-	iowrite8(PCI_HEADER_TYPE_BRIDGE, pcie->dbi + PCI_HEADER_TYPE);
+	iowrite8(PCI_HEADER_TYPE_BRIDGE, pcie->pp.dbi_base + PCI_HEADER_TYPE);
 }
 
 /* Fix class value */
 static void ls_pcie_fix_class(struct ls_pcie *pcie)
 {
-	iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->dbi + PCI_CLASS_DEVICE);
+	iowrite16(PCI_CLASS_BRIDGE_PCI, pcie->pp.dbi_base + PCI_CLASS_DEVICE);
 }
 
 /* Drop MSG TLP except for Vendor MSG */
@@ -82,9 +81,9 @@ static void ls_pcie_drop_msg_tlp(struct ls_pcie *pcie)
 {
 	u32 val;
 
-	val = ioread32(pcie->dbi + PCIE_STRFMR1);
+	val = ioread32(pcie->pp.dbi_base + PCIE_STRFMR1);
 	val &= 0xDFFFFFFF;
-	iowrite32(val, pcie->dbi + PCIE_STRFMR1);
+	iowrite32(val, pcie->pp.dbi_base + PCIE_STRFMR1);
 }
 
 static int ls1021_pcie_link_up(struct pcie_port *pp)
@@ -106,18 +105,19 @@ static int ls1021_pcie_link_up(struct pcie_port *pp)
 
 static void ls1021_pcie_host_init(struct pcie_port *pp)
 {
+	struct device *dev = pp->dev;
 	struct ls_pcie *pcie = to_ls_pcie(pp);
 	u32 index[2];
 
-	pcie->scfg = syscon_regmap_lookup_by_phandle(pp->dev->of_node,
+	pcie->scfg = syscon_regmap_lookup_by_phandle(dev->of_node,
 						     "fsl,pcie-scfg");
 	if (IS_ERR(pcie->scfg)) {
-		dev_err(pp->dev, "No syscfg phandle specified\n");
+		dev_err(dev, "No syscfg phandle specified\n");
 		pcie->scfg = NULL;
 		return;
 	}
 
-	if (of_property_read_u32_array(pp->dev->of_node,
+	if (of_property_read_u32_array(dev->of_node,
 				       "fsl,pcie-scfg", index, 2)) {
 		pcie->scfg = NULL;
 		return;
@@ -148,18 +148,19 @@ static void ls_pcie_host_init(struct pcie_port *pp)
 {
 	struct ls_pcie *pcie = to_ls_pcie(pp);
 
-	iowrite32(1, pcie->dbi + PCIE_DBI_RO_WR_EN);
+	iowrite32(1, pcie->pp.dbi_base + PCIE_DBI_RO_WR_EN);
 	ls_pcie_fix_class(pcie);
 	ls_pcie_clear_multifunction(pcie);
 	ls_pcie_drop_msg_tlp(pcie);
-	iowrite32(0, pcie->dbi + PCIE_DBI_RO_WR_EN);
+	iowrite32(0, pcie->pp.dbi_base + PCIE_DBI_RO_WR_EN);
 }
 
 static int ls_pcie_msi_host_init(struct pcie_port *pp,
 				 struct msi_controller *chip)
 {
+	struct device *dev = pp->dev;
+	struct device_node *np = dev->of_node;
 	struct device_node *msi_node;
-	struct device_node *np = pp->dev->of_node;
 
 	/*
 	 * The MSI domain is set by the generic of_msi_configure().  This
@@ -169,7 +170,7 @@ static int ls_pcie_msi_host_init(struct pcie_port *pp,
 	 */
 	msi_node = of_parse_phandle(np, "msi-parent", 0);
 	if (!msi_node) {
-		dev_err(pp->dev, "failed to find msi-parent\n");
+		dev_err(dev, "failed to find msi-parent\n");
 		return -EINVAL;
 	}
 
@@ -212,19 +213,15 @@ static const struct of_device_id ls_pcie_of_match[] = {
 	{ },
 };
 
-static int __init ls_add_pcie_port(struct pcie_port *pp,
-				   struct platform_device *pdev)
+static int __init ls_add_pcie_port(struct ls_pcie *pcie)
 {
+	struct pcie_port *pp = &pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
-	struct ls_pcie *pcie = to_ls_pcie(pp);
-
-	pp->dev = &pdev->dev;
-	pp->dbi_base = pcie->dbi;
-	pp->ops = pcie->drvdata->ops;
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(pp->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -233,38 +230,42 @@ static int __init ls_add_pcie_port(struct pcie_port *pp,
 
 static int __init ls_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	const struct of_device_id *match;
 	struct ls_pcie *pcie;
+	struct pcie_port *pp;
 	struct resource *dbi_base;
 	int ret;
 
-	match = of_match_device(ls_pcie_of_match, &pdev->dev);
+	match = of_match_device(ls_pcie_of_match, dev);
 	if (!match)
 		return -ENODEV;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
+	pp = &pcie->pp;
+	pp->dev = dev;
+	pp->ops = pcie->drvdata->ops;
+
 	dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "regs");
-	pcie->dbi = devm_ioremap_resource(&pdev->dev, dbi_base);
-	if (IS_ERR(pcie->dbi)) {
-		dev_err(&pdev->dev, "missing *regs* space\n");
-		return PTR_ERR(pcie->dbi);
+	pcie->pp.dbi_base = devm_ioremap_resource(dev, dbi_base);
+	if (IS_ERR(pcie->pp.dbi_base)) {
+		dev_err(dev, "missing *regs* space\n");
+		return PTR_ERR(pcie->pp.dbi_base);
 	}
 
 	pcie->drvdata = match->data;
-	pcie->lut = pcie->dbi + pcie->drvdata->lut_offset;
+	pcie->lut = pcie->pp.dbi_base + pcie->drvdata->lut_offset;
 
 	if (!ls_pcie_is_bridge(pcie))
 		return -ENODEV;
 
-	ret = ls_add_pcie_port(&pcie->pp, pdev);
+	ret = ls_add_pcie_port(pcie);
 	if (ret < 0)
 		return ret;
 
-	platform_set_drvdata(pdev, pcie);
-
 	return 0;
 }
 
diff --git a/drivers/pci/host/pci-mvebu.c b/drivers/pci/host/pci-mvebu.c
index 307f81d6b479..45a89d969700 100644
--- a/drivers/pci/host/pci-mvebu.c
+++ b/drivers/pci/host/pci-mvebu.c
@@ -1190,13 +1190,13 @@ static void mvebu_pcie_powerdown(struct mvebu_pcie_port *port)
 
 static int mvebu_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct mvebu_pcie *pcie;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct device_node *child;
 	int num, i, ret;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(struct mvebu_pcie),
-			    GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
@@ -1206,7 +1206,7 @@ static int mvebu_pcie_probe(struct platform_device *pdev)
 	/* Get the PCIe memory and I/O aperture */
 	mvebu_mbus_get_pcie_mem_aperture(&pcie->mem);
 	if (resource_size(&pcie->mem) == 0) {
-		dev_err(&pdev->dev, "invalid memory aperture size\n");
+		dev_err(dev, "invalid memory aperture size\n");
 		return -EINVAL;
 	}
 
@@ -1224,20 +1224,18 @@ static int mvebu_pcie_probe(struct platform_device *pdev)
 	/* Get the bus range */
 	ret = of_pci_parse_bus_range(np, &pcie->busn);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to parse bus-range property: %d\n",
-			ret);
+		dev_err(dev, "failed to parse bus-range property: %d\n", ret);
 		return ret;
 	}
 
-	num = of_get_available_child_count(pdev->dev.of_node);
+	num = of_get_available_child_count(np);
 
-	pcie->ports = devm_kcalloc(&pdev->dev, num, sizeof(*pcie->ports),
-				   GFP_KERNEL);
+	pcie->ports = devm_kcalloc(dev, num, sizeof(*pcie->ports), GFP_KERNEL);
 	if (!pcie->ports)
 		return -ENOMEM;
 
 	i = 0;
-	for_each_available_child_of_node(pdev->dev.of_node, child) {
+	for_each_available_child_of_node(np, child) {
 		struct mvebu_pcie_port *port = &pcie->ports[i];
 
 		ret = mvebu_pcie_parse_port(pcie, port, child);
@@ -1266,8 +1264,7 @@ static int mvebu_pcie_probe(struct platform_device *pdev)
 
 		port->base = mvebu_pcie_map_registers(pdev, child, port);
 		if (IS_ERR(port->base)) {
-			dev_err(&pdev->dev, "%s: cannot map registers\n",
-				port->name);
+			dev_err(dev, "%s: cannot map registers\n", port->name);
 			port->base = NULL;
 			mvebu_pcie_powerdown(port);
 			continue;
diff --git a/drivers/pci/host/pci-rcar-gen2.c b/drivers/pci/host/pci-rcar-gen2.c
index 597566f96f5e..1eeefa4df64c 100644
--- a/drivers/pci/host/pci-rcar-gen2.c
+++ b/drivers/pci/host/pci-rcar-gen2.c
@@ -154,10 +154,11 @@ static int rcar_pci_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 static irqreturn_t rcar_pci_err_irq(int irq, void *pw)
 {
 	struct rcar_pci_priv *priv = pw;
+	struct device *dev = priv->dev;
 	u32 status = ioread32(priv->reg + RCAR_PCI_INT_STATUS_REG);
 
 	if (status & RCAR_PCI_INT_ALLERRORS) {
-		dev_err(priv->dev, "error irq: status %08x\n", status);
+		dev_err(dev, "error irq: status %08x\n", status);
 
 		/* clear the error(s) */
 		iowrite32(status & RCAR_PCI_INT_ALLERRORS,
@@ -170,13 +171,14 @@ static irqreturn_t rcar_pci_err_irq(int irq, void *pw)
 
 static void rcar_pci_setup_errirq(struct rcar_pci_priv *priv)
 {
+	struct device *dev = priv->dev;
 	int ret;
 	u32 val;
 
-	ret = devm_request_irq(priv->dev, priv->irq, rcar_pci_err_irq,
+	ret = devm_request_irq(dev, priv->irq, rcar_pci_err_irq,
 			       IRQF_SHARED, "error irq", priv);
 	if (ret) {
-		dev_err(priv->dev, "cannot claim IRQ for error handling\n");
+		dev_err(dev, "cannot claim IRQ for error handling\n");
 		return;
 	}
 
@@ -192,15 +194,16 @@ static inline void rcar_pci_setup_errirq(struct rcar_pci_priv *priv) { }
 static int rcar_pci_setup(int nr, struct pci_sys_data *sys)
 {
 	struct rcar_pci_priv *priv = sys->private_data;
+	struct device *dev = priv->dev;
 	void __iomem *reg = priv->reg;
 	u32 val;
 	int ret;
 
-	pm_runtime_enable(priv->dev);
-	pm_runtime_get_sync(priv->dev);
+	pm_runtime_enable(dev);
+	pm_runtime_get_sync(dev);
 
 	val = ioread32(reg + RCAR_PCI_UNIT_REV_REG);
-	dev_info(priv->dev, "PCI: bus%u revision %x\n", sys->busnr, val);
+	dev_info(dev, "PCI: bus%u revision %x\n", sys->busnr, val);
 
 	/* Disable Direct Power Down State and assert reset */
 	val = ioread32(reg + RCAR_USBCTR_REG) & ~RCAR_USBCTR_DIRPD;
@@ -275,7 +278,7 @@ static int rcar_pci_setup(int nr, struct pci_sys_data *sys)
 
 	/* Add PCI resources */
 	pci_add_resource(&sys->resources, &priv->mem_res);
-	ret = devm_request_pci_bus_resources(priv->dev, &sys->resources);
+	ret = devm_request_pci_bus_resources(dev, &sys->resources);
 	if (ret < 0)
 		return ret;
 
@@ -311,6 +314,7 @@ static int pci_dma_range_parser_init(struct of_pci_range_parser *parser,
 static int rcar_pci_parse_map_dma_ranges(struct rcar_pci_priv *pci,
 					 struct device_node *np)
 {
+	struct device *dev = pci->dev;
 	struct of_pci_range range;
 	struct of_pci_range_parser parser;
 	int index = 0;
@@ -331,14 +335,14 @@ static int rcar_pci_parse_map_dma_ranges(struct rcar_pci_priv *pci,
 
 		/* Catch HW limitations */
 		if (!(range.flags & IORESOURCE_PREFETCH)) {
-			dev_err(pci->dev, "window must be prefetchable\n");
+			dev_err(dev, "window must be prefetchable\n");
 			return -EINVAL;
 		}
 		if (pci->window_addr) {
 			u32 lowaddr = 1 << (ffs(pci->window_addr) - 1);
 
 			if (lowaddr < pci->window_size) {
-				dev_err(pci->dev, "invalid window size/addr\n");
+				dev_err(dev, "invalid window size/addr\n");
 				return -EINVAL;
 			}
 		}
@@ -350,6 +354,7 @@ static int rcar_pci_parse_map_dma_ranges(struct rcar_pci_priv *pci,
 
 static int rcar_pci_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct resource *cfg_res, *mem_res;
 	struct rcar_pci_priv *priv;
 	void __iomem *reg;
@@ -357,7 +362,7 @@ static int rcar_pci_probe(struct platform_device *pdev)
 	void *hw_private[1];
 
 	cfg_res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	reg = devm_ioremap_resource(&pdev->dev, cfg_res);
+	reg = devm_ioremap_resource(dev, cfg_res);
 	if (IS_ERR(reg))
 		return PTR_ERR(reg);
 
@@ -368,8 +373,7 @@ static int rcar_pci_probe(struct platform_device *pdev)
 	if (mem_res->start & 0xFFFF)
 		return -EINVAL;
 
-	priv = devm_kzalloc(&pdev->dev,
-			    sizeof(struct rcar_pci_priv), GFP_KERNEL);
+	priv = devm_kzalloc(dev, sizeof(struct rcar_pci_priv), GFP_KERNEL);
 	if (!priv)
 		return -ENOMEM;
 
@@ -378,10 +382,10 @@ static int rcar_pci_probe(struct platform_device *pdev)
 
 	priv->irq = platform_get_irq(pdev, 0);
 	priv->reg = reg;
-	priv->dev = &pdev->dev;
+	priv->dev = dev;
 
 	if (priv->irq < 0) {
-		dev_err(&pdev->dev, "no valid irq found\n");
+		dev_err(dev, "no valid irq found\n");
 		return priv->irq;
 	}
 
@@ -390,23 +394,23 @@ static int rcar_pci_probe(struct platform_device *pdev)
 	priv->window_pci = 0x40000000;
 	priv->window_size = SZ_1G;
 
-	if (pdev->dev.of_node) {
+	if (dev->of_node) {
 		struct resource busnr;
 		int ret;
 
-		ret = of_pci_parse_bus_range(pdev->dev.of_node, &busnr);
+		ret = of_pci_parse_bus_range(dev->of_node, &busnr);
 		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to parse bus-range\n");
+			dev_err(dev, "failed to parse bus-range\n");
 			return ret;
 		}
 
 		priv->busnr = busnr.start;
 		if (busnr.end != busnr.start)
-			dev_warn(&pdev->dev, "only one bus number supported\n");
+			dev_warn(dev, "only one bus number supported\n");
 
-		ret = rcar_pci_parse_map_dma_ranges(priv, pdev->dev.of_node);
+		ret = rcar_pci_parse_map_dma_ranges(priv, dev->of_node);
 		if (ret < 0) {
-			dev_err(&pdev->dev, "failed to parse dma-range\n");
+			dev_err(dev, "failed to parse dma-range\n");
 			return ret;
 		}
 	} else {
@@ -421,7 +425,7 @@ static int rcar_pci_probe(struct platform_device *pdev)
 	hw.map_irq = rcar_pci_map_irq;
 	hw.ops = &rcar_pci_ops;
 	hw.setup = rcar_pci_setup;
-	pci_common_init_dev(&pdev->dev, &hw);
+	pci_common_init_dev(dev, &hw);
 	return 0;
 }
 
diff --git a/drivers/pci/host/pci-tegra.c b/drivers/pci/host/pci-tegra.c
index e2a8e4cab22e..8dfccf733241 100644
--- a/drivers/pci/host/pci-tegra.c
+++ b/drivers/pci/host/pci-tegra.c
@@ -384,6 +384,7 @@ static unsigned long tegra_pcie_conf_offset(unsigned int devfn, int where)
 static struct tegra_pcie_bus *tegra_pcie_bus_alloc(struct tegra_pcie *pcie,
 						   unsigned int busnr)
 {
+	struct device *dev = pcie->dev;
 	pgprot_t prot = __pgprot(L_PTE_PRESENT | L_PTE_YOUNG | L_PTE_DIRTY |
 				 L_PTE_XN | L_PTE_MT_DEV_SHARED | L_PTE_SHARED);
 	phys_addr_t cs = pcie->cs->start;
@@ -413,8 +414,7 @@ static struct tegra_pcie_bus *tegra_pcie_bus_alloc(struct tegra_pcie *pcie,
 
 		err = ioremap_page_range(virt, virt + SZ_64K, phys, prot);
 		if (err < 0) {
-			dev_err(pcie->dev, "ioremap_page_range() failed: %d\n",
-				err);
+			dev_err(dev, "ioremap_page_range() failed: %d\n", err);
 			goto unmap;
 		}
 	}
@@ -462,6 +462,7 @@ static void __iomem *tegra_pcie_map_bus(struct pci_bus *bus,
 					int where)
 {
 	struct tegra_pcie *pcie = sys_to_pcie(bus->sysdata);
+	struct device *dev = pcie->dev;
 	void __iomem *addr = NULL;
 
 	if (bus->number == 0) {
@@ -482,8 +483,7 @@ static void __iomem *tegra_pcie_map_bus(struct pci_bus *bus,
 				addr = (void __iomem *)b->area->addr;
 
 		if (!addr) {
-			dev_err(pcie->dev,
-				"failed to map cfg. space for bus %u\n",
+			dev_err(dev, "failed to map cfg. space for bus %u\n",
 				bus->number);
 			return NULL;
 		}
@@ -584,12 +584,13 @@ static void tegra_pcie_port_disable(struct tegra_pcie_port *port)
 static void tegra_pcie_port_free(struct tegra_pcie_port *port)
 {
 	struct tegra_pcie *pcie = port->pcie;
+	struct device *dev = pcie->dev;
 
-	devm_iounmap(pcie->dev, port->base);
-	devm_release_mem_region(pcie->dev, port->regs.start,
+	devm_iounmap(dev, port->base);
+	devm_release_mem_region(dev, port->regs.start,
 				resource_size(&port->regs));
 	list_del(&port->list);
-	devm_kfree(pcie->dev, port);
+	devm_kfree(dev, port);
 }
 
 /* Tegra PCIE root complex wrongly reports device class */
@@ -612,12 +613,13 @@ DECLARE_PCI_FIXUP_FINAL(PCI_ANY_ID, PCI_ANY_ID, tegra_pcie_relax_enable);
 static int tegra_pcie_setup(int nr, struct pci_sys_data *sys)
 {
 	struct tegra_pcie *pcie = sys_to_pcie(sys);
+	struct device *dev = pcie->dev;
 	int err;
 
 	sys->mem_offset = pcie->offset.mem;
 	sys->io_offset = pcie->offset.io;
 
-	err = devm_request_resource(pcie->dev, &iomem_resource, &pcie->io);
+	err = devm_request_resource(dev, &iomem_resource, &pcie->io);
 	if (err < 0)
 		return err;
 
@@ -631,7 +633,7 @@ static int tegra_pcie_setup(int nr, struct pci_sys_data *sys)
 				sys->mem_offset);
 	pci_add_resource(&sys->resources, &pcie->busn);
 
-	err = devm_request_pci_bus_resources(pcie->dev, &sys->resources);
+	err = devm_request_pci_bus_resources(dev, &sys->resources);
 	if (err < 0)
 		return err;
 
@@ -672,6 +674,7 @@ static irqreturn_t tegra_pcie_isr(int irq, void *arg)
 		"Peer2Peer error",
 	};
 	struct tegra_pcie *pcie = arg;
+	struct device *dev = pcie->dev;
 	u32 code, signature;
 
 	code = afi_readl(pcie, AFI_INTR_CODE) & AFI_INTR_CODE_MASK;
@@ -689,11 +692,9 @@ static irqreturn_t tegra_pcie_isr(int irq, void *arg)
 	 * happen a lot during enumeration
 	 */
 	if (code == AFI_INTR_MASTER_ABORT)
-		dev_dbg(pcie->dev, "%s, signature: %08x\n", err_msg[code],
-			signature);
+		dev_dbg(dev, "%s, signature: %08x\n", err_msg[code], signature);
 	else
-		dev_err(pcie->dev, "%s, signature: %08x\n", err_msg[code],
-			signature);
+		dev_err(dev, "%s, signature: %08x\n", err_msg[code], signature);
 
 	if (code == AFI_INTR_TARGET_ABORT || code == AFI_INTR_MASTER_ABORT ||
 	    code == AFI_INTR_FPCI_DECODE_ERROR) {
@@ -701,9 +702,9 @@ static irqreturn_t tegra_pcie_isr(int irq, void *arg)
 		u64 address = (u64)fpci << 32 | (signature & 0xfffffffc);
 
 		if (code == AFI_INTR_MASTER_ABORT)
-			dev_dbg(pcie->dev, "  FPCI address: %10llx\n", address);
+			dev_dbg(dev, "  FPCI address: %10llx\n", address);
 		else
-			dev_err(pcie->dev, "  FPCI address: %10llx\n", address);
+			dev_err(dev, "  FPCI address: %10llx\n", address);
 	}
 
 	return IRQ_HANDLED;
@@ -793,6 +794,7 @@ static int tegra_pcie_pll_wait(struct tegra_pcie *pcie, unsigned long timeout)
 
 static int tegra_pcie_phy_enable(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	u32 value;
 	int err;
@@ -829,7 +831,7 @@ static int tegra_pcie_phy_enable(struct tegra_pcie *pcie)
 	/* wait for the PLL to lock */
 	err = tegra_pcie_pll_wait(pcie, 500);
 	if (err < 0) {
-		dev_err(pcie->dev, "PLL failed to lock: %d\n", err);
+		dev_err(dev, "PLL failed to lock: %d\n", err);
 		return err;
 	}
 
@@ -859,7 +861,7 @@ static int tegra_pcie_phy_disable(struct tegra_pcie *pcie)
 	/* override IDDQ */
 	value = pads_readl(pcie, PADS_CTL);
 	value |= PADS_CTL_IDDQ_1L;
-	pads_writel(pcie, PADS_CTL, value);
+	pads_writel(pcie, value, PADS_CTL);
 
 	/* reset PLL */
 	value = pads_readl(pcie, soc->pads_pll_ctl);
@@ -880,8 +882,7 @@ static int tegra_pcie_port_phy_power_on(struct tegra_pcie_port *port)
 	for (i = 0; i < port->lanes; i++) {
 		err = phy_power_on(port->phys[i]);
 		if (err < 0) {
-			dev_err(dev, "failed to power on PHY#%u: %d\n", i,
-				err);
+			dev_err(dev, "failed to power on PHY#%u: %d\n", i, err);
 			return err;
 		}
 	}
@@ -909,6 +910,7 @@ static int tegra_pcie_port_phy_power_off(struct tegra_pcie_port *port)
 
 static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct tegra_pcie_port *port;
 	int err;
@@ -920,7 +922,7 @@ static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
 			err = tegra_pcie_phy_enable(pcie);
 
 		if (err < 0)
-			dev_err(pcie->dev, "failed to power on PHY: %d\n", err);
+			dev_err(dev, "failed to power on PHY: %d\n", err);
 
 		return err;
 	}
@@ -928,7 +930,7 @@ static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
 	list_for_each_entry(port, &pcie->ports, list) {
 		err = tegra_pcie_port_phy_power_on(port);
 		if (err < 0) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"failed to power on PCIe port %u PHY: %d\n",
 				port->index, err);
 			return err;
@@ -946,6 +948,7 @@ static int tegra_pcie_phy_power_on(struct tegra_pcie *pcie)
 
 static int tegra_pcie_phy_power_off(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct tegra_pcie_port *port;
 	int err;
 
@@ -956,8 +959,7 @@ static int tegra_pcie_phy_power_off(struct tegra_pcie *pcie)
 			err = tegra_pcie_phy_disable(pcie);
 
 		if (err < 0)
-			dev_err(pcie->dev, "failed to power off PHY: %d\n",
-				err);
+			dev_err(dev, "failed to power off PHY: %d\n", err);
 
 		return err;
 	}
@@ -965,7 +967,7 @@ static int tegra_pcie_phy_power_off(struct tegra_pcie *pcie)
 	list_for_each_entry(port, &pcie->ports, list) {
 		err = tegra_pcie_port_phy_power_off(port);
 		if (err < 0) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"failed to power off PCIe port %u PHY: %d\n",
 				port->index, err);
 			return err;
@@ -977,6 +979,7 @@ static int tegra_pcie_phy_power_off(struct tegra_pcie *pcie)
 
 static int tegra_pcie_enable_controller(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct tegra_pcie_port *port;
 	unsigned long value;
@@ -1016,7 +1019,7 @@ static int tegra_pcie_enable_controller(struct tegra_pcie *pcie)
 
 	err = tegra_pcie_phy_power_on(pcie);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to power on PHY(s): %d\n", err);
+		dev_err(dev, "failed to power on PHY(s): %d\n", err);
 		return err;
 	}
 
@@ -1049,13 +1052,14 @@ static int tegra_pcie_enable_controller(struct tegra_pcie *pcie)
 
 static void tegra_pcie_power_off(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int err;
 
 	/* TODO: disable and unprepare clocks? */
 
 	err = tegra_pcie_phy_power_off(pcie);
 	if (err < 0)
-		dev_err(pcie->dev, "failed to power off PHY(s): %d\n", err);
+		dev_err(dev, "failed to power off PHY(s): %d\n", err);
 
 	reset_control_assert(pcie->pcie_xrst);
 	reset_control_assert(pcie->afi_rst);
@@ -1065,11 +1069,12 @@ static void tegra_pcie_power_off(struct tegra_pcie *pcie)
 
 	err = regulator_bulk_disable(pcie->num_supplies, pcie->supplies);
 	if (err < 0)
-		dev_warn(pcie->dev, "failed to disable regulators: %d\n", err);
+		dev_warn(dev, "failed to disable regulators: %d\n", err);
 }
 
 static int tegra_pcie_power_on(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	int err;
 
@@ -1082,13 +1087,13 @@ static int tegra_pcie_power_on(struct tegra_pcie *pcie)
 	/* enable regulators */
 	err = regulator_bulk_enable(pcie->num_supplies, pcie->supplies);
 	if (err < 0)
-		dev_err(pcie->dev, "failed to enable regulators: %d\n", err);
+		dev_err(dev, "failed to enable regulators: %d\n", err);
 
 	err = tegra_powergate_sequence_power_up(TEGRA_POWERGATE_PCIE,
 						pcie->pex_clk,
 						pcie->pex_rst);
 	if (err) {
-		dev_err(pcie->dev, "powerup sequence failed: %d\n", err);
+		dev_err(dev, "powerup sequence failed: %d\n", err);
 		return err;
 	}
 
@@ -1096,22 +1101,21 @@ static int tegra_pcie_power_on(struct tegra_pcie *pcie)
 
 	err = clk_prepare_enable(pcie->afi_clk);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to enable AFI clock: %d\n", err);
+		dev_err(dev, "failed to enable AFI clock: %d\n", err);
 		return err;
 	}
 
 	if (soc->has_cml_clk) {
 		err = clk_prepare_enable(pcie->cml_clk);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to enable CML clock: %d\n",
-				err);
+			dev_err(dev, "failed to enable CML clock: %d\n", err);
 			return err;
 		}
 	}
 
 	err = clk_prepare_enable(pcie->pll_e);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to enable PLLE clock: %d\n", err);
+		dev_err(dev, "failed to enable PLLE clock: %d\n", err);
 		return err;
 	}
 
@@ -1120,22 +1124,23 @@ static int tegra_pcie_power_on(struct tegra_pcie *pcie)
 
 static int tegra_pcie_clocks_get(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 
-	pcie->pex_clk = devm_clk_get(pcie->dev, "pex");
+	pcie->pex_clk = devm_clk_get(dev, "pex");
 	if (IS_ERR(pcie->pex_clk))
 		return PTR_ERR(pcie->pex_clk);
 
-	pcie->afi_clk = devm_clk_get(pcie->dev, "afi");
+	pcie->afi_clk = devm_clk_get(dev, "afi");
 	if (IS_ERR(pcie->afi_clk))
 		return PTR_ERR(pcie->afi_clk);
 
-	pcie->pll_e = devm_clk_get(pcie->dev, "pll_e");
+	pcie->pll_e = devm_clk_get(dev, "pll_e");
 	if (IS_ERR(pcie->pll_e))
 		return PTR_ERR(pcie->pll_e);
 
 	if (soc->has_cml_clk) {
-		pcie->cml_clk = devm_clk_get(pcie->dev, "cml");
+		pcie->cml_clk = devm_clk_get(dev, "cml");
 		if (IS_ERR(pcie->cml_clk))
 			return PTR_ERR(pcie->cml_clk);
 	}
@@ -1145,15 +1150,17 @@ static int tegra_pcie_clocks_get(struct tegra_pcie *pcie)
 
 static int tegra_pcie_resets_get(struct tegra_pcie *pcie)
 {
-	pcie->pex_rst = devm_reset_control_get(pcie->dev, "pex");
+	struct device *dev = pcie->dev;
+
+	pcie->pex_rst = devm_reset_control_get(dev, "pex");
 	if (IS_ERR(pcie->pex_rst))
 		return PTR_ERR(pcie->pex_rst);
 
-	pcie->afi_rst = devm_reset_control_get(pcie->dev, "afi");
+	pcie->afi_rst = devm_reset_control_get(dev, "afi");
 	if (IS_ERR(pcie->afi_rst))
 		return PTR_ERR(pcie->afi_rst);
 
-	pcie->pcie_xrst = devm_reset_control_get(pcie->dev, "pcie_x");
+	pcie->pcie_xrst = devm_reset_control_get(dev, "pcie_x");
 	if (IS_ERR(pcie->pcie_xrst))
 		return PTR_ERR(pcie->pcie_xrst);
 
@@ -1162,18 +1169,19 @@ static int tegra_pcie_resets_get(struct tegra_pcie *pcie)
 
 static int tegra_pcie_phys_get_legacy(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int err;
 
-	pcie->phy = devm_phy_optional_get(pcie->dev, "pcie");
+	pcie->phy = devm_phy_optional_get(dev, "pcie");
 	if (IS_ERR(pcie->phy)) {
 		err = PTR_ERR(pcie->phy);
-		dev_err(pcie->dev, "failed to get PHY: %d\n", err);
+		dev_err(dev, "failed to get PHY: %d\n", err);
 		return err;
 	}
 
 	err = phy_init(pcie->phy);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to initialize PHY: %d\n", err);
+		dev_err(dev, "failed to initialize PHY: %d\n", err);
 		return err;
 	}
 
@@ -1256,43 +1264,44 @@ static int tegra_pcie_phys_get(struct tegra_pcie *pcie)
 
 static int tegra_pcie_get_resources(struct tegra_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	struct resource *pads, *afi, *res;
 	int err;
 
 	err = tegra_pcie_clocks_get(pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to get clocks: %d\n", err);
+		dev_err(dev, "failed to get clocks: %d\n", err);
 		return err;
 	}
 
 	err = tegra_pcie_resets_get(pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to get resets: %d\n", err);
+		dev_err(dev, "failed to get resets: %d\n", err);
 		return err;
 	}
 
 	err = tegra_pcie_phys_get(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to get PHYs: %d\n", err);
+		dev_err(dev, "failed to get PHYs: %d\n", err);
 		return err;
 	}
 
 	err = tegra_pcie_power_on(pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to power up: %d\n", err);
+		dev_err(dev, "failed to power up: %d\n", err);
 		return err;
 	}
 
 	pads = platform_get_resource_byname(pdev, IORESOURCE_MEM, "pads");
-	pcie->pads = devm_ioremap_resource(&pdev->dev, pads);
+	pcie->pads = devm_ioremap_resource(dev, pads);
 	if (IS_ERR(pcie->pads)) {
 		err = PTR_ERR(pcie->pads);
 		goto poweroff;
 	}
 
 	afi = platform_get_resource_byname(pdev, IORESOURCE_MEM, "afi");
-	pcie->afi = devm_ioremap_resource(&pdev->dev, afi);
+	pcie->afi = devm_ioremap_resource(dev, afi);
 	if (IS_ERR(pcie->afi)) {
 		err = PTR_ERR(pcie->afi);
 		goto poweroff;
@@ -1305,7 +1314,7 @@ static int tegra_pcie_get_resources(struct tegra_pcie *pcie)
 		goto poweroff;
 	}
 
-	pcie->cs = devm_request_mem_region(pcie->dev, res->start,
+	pcie->cs = devm_request_mem_region(dev, res->start,
 					   resource_size(res), res->name);
 	if (!pcie->cs) {
 		err = -EADDRNOTAVAIL;
@@ -1315,7 +1324,7 @@ static int tegra_pcie_get_resources(struct tegra_pcie *pcie)
 	/* request interrupt */
 	err = platform_get_irq_byname(pdev, "intr");
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ: %d\n", err);
+		dev_err(dev, "failed to get IRQ: %d\n", err);
 		goto poweroff;
 	}
 
@@ -1323,7 +1332,7 @@ static int tegra_pcie_get_resources(struct tegra_pcie *pcie)
 
 	err = request_irq(pcie->irq, tegra_pcie_isr, IRQF_SHARED, "PCIE", pcie);
 	if (err) {
-		dev_err(&pdev->dev, "failed to register IRQ: %d\n", err);
+		dev_err(dev, "failed to register IRQ: %d\n", err);
 		goto poweroff;
 	}
 
@@ -1336,6 +1345,7 @@ poweroff:
 
 static int tegra_pcie_put_resources(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int err;
 
 	if (pcie->irq > 0)
@@ -1345,7 +1355,7 @@ static int tegra_pcie_put_resources(struct tegra_pcie *pcie)
 
 	err = phy_exit(pcie->phy);
 	if (err < 0)
-		dev_err(pcie->dev, "failed to teardown PHY: %d\n", err);
+		dev_err(dev, "failed to teardown PHY: %d\n", err);
 
 	return 0;
 }
@@ -1384,6 +1394,7 @@ static void tegra_msi_free(struct tegra_msi *chip, unsigned long irq)
 static irqreturn_t tegra_pcie_msi_irq(int irq, void *data)
 {
 	struct tegra_pcie *pcie = data;
+	struct device *dev = pcie->dev;
 	struct tegra_msi *msi = &pcie->msi;
 	unsigned int i, processed = 0;
 
@@ -1403,13 +1414,13 @@ static irqreturn_t tegra_pcie_msi_irq(int irq, void *data)
 				if (test_bit(index, msi->used))
 					generic_handle_irq(irq);
 				else
-					dev_info(pcie->dev, "unhandled MSI\n");
+					dev_info(dev, "unhandled MSI\n");
 			} else {
 				/*
 				 * that's weird who triggered this?
 				 * just clear it
 				 */
-				dev_info(pcie->dev, "unexpected MSI\n");
+				dev_info(dev, "unexpected MSI\n");
 			}
 
 			/* see if there's any more pending in this vector */
@@ -1488,7 +1499,8 @@ static const struct irq_domain_ops msi_domain_ops = {
 
 static int tegra_pcie_enable_msi(struct tegra_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct tegra_msi *msi = &pcie->msi;
 	unsigned long base;
@@ -1497,20 +1509,20 @@ static int tegra_pcie_enable_msi(struct tegra_pcie *pcie)
 
 	mutex_init(&msi->lock);
 
-	msi->chip.dev = pcie->dev;
+	msi->chip.dev = dev;
 	msi->chip.setup_irq = tegra_msi_setup_irq;
 	msi->chip.teardown_irq = tegra_msi_teardown_irq;
 
-	msi->domain = irq_domain_add_linear(pcie->dev->of_node, INT_PCI_MSI_NR,
+	msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR,
 					    &msi_domain_ops, &msi->chip);
 	if (!msi->domain) {
-		dev_err(&pdev->dev, "failed to create IRQ domain\n");
+		dev_err(dev, "failed to create IRQ domain\n");
 		return -ENOMEM;
 	}
 
 	err = platform_get_irq_byname(pdev, "msi");
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ: %d\n", err);
+		dev_err(dev, "failed to get IRQ: %d\n", err);
 		goto err;
 	}
 
@@ -1519,7 +1531,7 @@ static int tegra_pcie_enable_msi(struct tegra_pcie *pcie)
 	err = request_irq(msi->irq, tegra_pcie_msi_irq, IRQF_NO_THREAD,
 			  tegra_msi_irq_chip.name, pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request IRQ: %d\n", err);
+		dev_err(dev, "failed to request IRQ: %d\n", err);
 		goto err;
 	}
 
@@ -1594,46 +1606,47 @@ static int tegra_pcie_disable_msi(struct tegra_pcie *pcie)
 static int tegra_pcie_get_xbar_config(struct tegra_pcie *pcie, u32 lanes,
 				      u32 *xbar)
 {
-	struct device_node *np = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node;
 
 	if (of_device_is_compatible(np, "nvidia,tegra124-pcie")) {
 		switch (lanes) {
 		case 0x0000104:
-			dev_info(pcie->dev, "4x1, 1x1 configuration\n");
+			dev_info(dev, "4x1, 1x1 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_X4_X1;
 			return 0;
 
 		case 0x0000102:
-			dev_info(pcie->dev, "2x1, 1x1 configuration\n");
+			dev_info(dev, "2x1, 1x1 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_X2_X1;
 			return 0;
 		}
 	} else if (of_device_is_compatible(np, "nvidia,tegra30-pcie")) {
 		switch (lanes) {
 		case 0x00000204:
-			dev_info(pcie->dev, "4x1, 2x1 configuration\n");
+			dev_info(dev, "4x1, 2x1 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_420;
 			return 0;
 
 		case 0x00020202:
-			dev_info(pcie->dev, "2x3 configuration\n");
+			dev_info(dev, "2x3 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_222;
 			return 0;
 
 		case 0x00010104:
-			dev_info(pcie->dev, "4x1, 1x2 configuration\n");
+			dev_info(dev, "4x1, 1x2 configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_411;
 			return 0;
 		}
 	} else if (of_device_is_compatible(np, "nvidia,tegra20-pcie")) {
 		switch (lanes) {
 		case 0x00000004:
-			dev_info(pcie->dev, "single-mode configuration\n");
+			dev_info(dev, "single-mode configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_SINGLE;
 			return 0;
 
 		case 0x00000202:
-			dev_info(pcie->dev, "dual-mode configuration\n");
+			dev_info(dev, "dual-mode configuration\n");
 			*xbar = AFI_PCIE_CONFIG_SM2TMS0_XBAR_CONFIG_DUAL;
 			return 0;
 		}
@@ -1673,7 +1686,8 @@ static bool of_regulator_bulk_available(struct device_node *np,
  */
 static int tegra_pcie_get_legacy_regulators(struct tegra_pcie *pcie)
 {
-	struct device_node *np = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node;
 
 	if (of_device_is_compatible(np, "nvidia,tegra30-pcie"))
 		pcie->num_supplies = 3;
@@ -1681,12 +1695,12 @@ static int tegra_pcie_get_legacy_regulators(struct tegra_pcie *pcie)
 		pcie->num_supplies = 2;
 
 	if (pcie->num_supplies == 0) {
-		dev_err(pcie->dev, "device %s not supported in legacy mode\n",
+		dev_err(dev, "device %s not supported in legacy mode\n",
 			np->full_name);
 		return -ENODEV;
 	}
 
-	pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+	pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 				      sizeof(*pcie->supplies),
 				      GFP_KERNEL);
 	if (!pcie->supplies)
@@ -1698,8 +1712,7 @@ static int tegra_pcie_get_legacy_regulators(struct tegra_pcie *pcie)
 	if (pcie->num_supplies > 2)
 		pcie->supplies[2].supply = "avdd";
 
-	return devm_regulator_bulk_get(pcie->dev, pcie->num_supplies,
-				       pcie->supplies);
+	return devm_regulator_bulk_get(dev, pcie->num_supplies, pcie->supplies);
 }
 
 /*
@@ -1713,13 +1726,14 @@ static int tegra_pcie_get_legacy_regulators(struct tegra_pcie *pcie)
  */
 static int tegra_pcie_get_regulators(struct tegra_pcie *pcie, u32 lane_mask)
 {
-	struct device_node *np = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node;
 	unsigned int i = 0;
 
 	if (of_device_is_compatible(np, "nvidia,tegra124-pcie")) {
 		pcie->num_supplies = 7;
 
-		pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+		pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 					      sizeof(*pcie->supplies),
 					      GFP_KERNEL);
 		if (!pcie->supplies)
@@ -1746,7 +1760,7 @@ static int tegra_pcie_get_regulators(struct tegra_pcie *pcie, u32 lane_mask)
 		pcie->num_supplies = 4 + (need_pexa ? 2 : 0) +
 					 (need_pexb ? 2 : 0);
 
-		pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+		pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 					      sizeof(*pcie->supplies),
 					      GFP_KERNEL);
 		if (!pcie->supplies)
@@ -1769,7 +1783,7 @@ static int tegra_pcie_get_regulators(struct tegra_pcie *pcie, u32 lane_mask)
 	} else if (of_device_is_compatible(np, "nvidia,tegra20-pcie")) {
 		pcie->num_supplies = 5;
 
-		pcie->supplies = devm_kcalloc(pcie->dev, pcie->num_supplies,
+		pcie->supplies = devm_kcalloc(dev, pcie->num_supplies,
 					      sizeof(*pcie->supplies),
 					      GFP_KERNEL);
 		if (!pcie->supplies)
@@ -1782,9 +1796,9 @@ static int tegra_pcie_get_regulators(struct tegra_pcie *pcie, u32 lane_mask)
 		pcie->supplies[4].supply = "vddio-pex-clk";
 	}
 
-	if (of_regulator_bulk_available(pcie->dev->of_node, pcie->supplies,
+	if (of_regulator_bulk_available(dev->of_node, pcie->supplies,
 					pcie->num_supplies))
-		return devm_regulator_bulk_get(pcie->dev, pcie->num_supplies,
+		return devm_regulator_bulk_get(dev, pcie->num_supplies,
 					       pcie->supplies);
 
 	/*
@@ -1792,9 +1806,9 @@ static int tegra_pcie_get_regulators(struct tegra_pcie *pcie, u32 lane_mask)
 	 * that the device tree complies with an older version of the device
 	 * tree binding.
 	 */
-	dev_info(pcie->dev, "using legacy DT binding for power supplies\n");
+	dev_info(dev, "using legacy DT binding for power supplies\n");
 
-	devm_kfree(pcie->dev, pcie->supplies);
+	devm_kfree(dev, pcie->supplies);
 	pcie->num_supplies = 0;
 
 	return tegra_pcie_get_legacy_regulators(pcie);
@@ -1802,7 +1816,8 @@ static int tegra_pcie_get_regulators(struct tegra_pcie *pcie, u32 lane_mask)
 
 static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 {
-	struct device_node *np = pcie->dev->of_node, *port;
+	struct device *dev = pcie->dev;
+	struct device_node *np = dev->of_node, *port;
 	const struct tegra_pcie_soc *soc = pcie->soc;
 	struct of_pci_range_parser parser;
 	struct of_pci_range range;
@@ -1812,7 +1827,7 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 	int err;
 
 	if (of_pci_range_parser_init(&parser, np)) {
-		dev_err(pcie->dev, "missing \"ranges\" property\n");
+		dev_err(dev, "missing \"ranges\" property\n");
 		return -EINVAL;
 	}
 
@@ -1867,8 +1882,7 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 
 	err = of_pci_parse_bus_range(np, &pcie->busn);
 	if (err < 0) {
-		dev_err(pcie->dev, "failed to parse ranges property: %d\n",
-			err);
+		dev_err(dev, "failed to parse ranges property: %d\n", err);
 		pcie->busn.name = np->name;
 		pcie->busn.start = 0;
 		pcie->busn.end = 0xff;
@@ -1883,15 +1897,14 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 
 		err = of_pci_get_devfn(port);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to parse address: %d\n",
-				err);
+			dev_err(dev, "failed to parse address: %d\n", err);
 			return err;
 		}
 
 		index = PCI_SLOT(err);
 
 		if (index < 1 || index > soc->num_ports) {
-			dev_err(pcie->dev, "invalid port number: %d\n", index);
+			dev_err(dev, "invalid port number: %d\n", index);
 			return -EINVAL;
 		}
 
@@ -1899,13 +1912,13 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 
 		err = of_property_read_u32(port, "nvidia,num-lanes", &value);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to parse # of lanes: %d\n",
+			dev_err(dev, "failed to parse # of lanes: %d\n",
 				err);
 			return err;
 		}
 
 		if (value > 16) {
-			dev_err(pcie->dev, "invalid # of lanes: %u\n", value);
+			dev_err(dev, "invalid # of lanes: %u\n", value);
 			return -EINVAL;
 		}
 
@@ -1919,14 +1932,13 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 		mask |= ((1 << value) - 1) << lane;
 		lane += value;
 
-		rp = devm_kzalloc(pcie->dev, sizeof(*rp), GFP_KERNEL);
+		rp = devm_kzalloc(dev, sizeof(*rp), GFP_KERNEL);
 		if (!rp)
 			return -ENOMEM;
 
 		err = of_address_to_resource(port, 0, &rp->regs);
 		if (err < 0) {
-			dev_err(pcie->dev, "failed to parse address: %d\n",
-				err);
+			dev_err(dev, "failed to parse address: %d\n", err);
 			return err;
 		}
 
@@ -1936,7 +1948,7 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 		rp->pcie = pcie;
 		rp->np = port;
 
-		rp->base = devm_ioremap_resource(pcie->dev, &rp->regs);
+		rp->base = devm_ioremap_resource(dev, &rp->regs);
 		if (IS_ERR(rp->base))
 			return PTR_ERR(rp->base);
 
@@ -1945,7 +1957,7 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 
 	err = tegra_pcie_get_xbar_config(pcie, lanes, &pcie->xbar_config);
 	if (err < 0) {
-		dev_err(pcie->dev, "invalid lane configuration\n");
+		dev_err(dev, "invalid lane configuration\n");
 		return err;
 	}
 
@@ -1964,6 +1976,7 @@ static int tegra_pcie_parse_dt(struct tegra_pcie *pcie)
 #define TEGRA_PCIE_LINKUP_TIMEOUT	200	/* up to 1.2 seconds */
 static bool tegra_pcie_port_check_link(struct tegra_pcie_port *port)
 {
+	struct device *dev = port->pcie->dev;
 	unsigned int retries = 3;
 	unsigned long value;
 
@@ -1986,8 +1999,7 @@ static bool tegra_pcie_port_check_link(struct tegra_pcie_port *port)
 		} while (--timeout);
 
 		if (!timeout) {
-			dev_err(port->pcie->dev, "link %u down, retrying\n",
-				port->index);
+			dev_err(dev, "link %u down, retrying\n", port->index);
 			goto retry;
 		}
 
@@ -2011,11 +2023,12 @@ retry:
 
 static int tegra_pcie_enable(struct tegra_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct tegra_pcie_port *port, *tmp;
 	struct hw_pci hw;
 
 	list_for_each_entry_safe(port, tmp, &pcie->ports, list) {
-		dev_info(pcie->dev, "probing port %u, using %u lanes\n",
+		dev_info(dev, "probing port %u, using %u lanes\n",
 			 port->index, port->lanes);
 
 		tegra_pcie_port_enable(port);
@@ -2023,7 +2036,7 @@ static int tegra_pcie_enable(struct tegra_pcie *pcie)
 		if (tegra_pcie_port_check_link(port))
 			continue;
 
-		dev_info(pcie->dev, "link %u down, ignoring\n", port->index);
+		dev_info(dev, "link %u down, ignoring\n", port->index);
 
 		tegra_pcie_port_disable(port);
 		tegra_pcie_port_free(port);
@@ -2041,8 +2054,7 @@ static int tegra_pcie_enable(struct tegra_pcie *pcie)
 	hw.map_irq = tegra_pcie_map_irq;
 	hw.ops = &tegra_pcie_ops;
 
-	pci_common_init_dev(pcie->dev, &hw);
-
+	pci_common_init_dev(dev, &hw);
 	return 0;
 }
 
@@ -2204,17 +2216,18 @@ remove:
 
 static int tegra_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct tegra_pcie *pcie;
 	int err;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->soc = of_device_get_match_data(&pdev->dev);
+	pcie->soc = of_device_get_match_data(dev);
 	INIT_LIST_HEAD(&pcie->buses);
 	INIT_LIST_HEAD(&pcie->ports);
-	pcie->dev = &pdev->dev;
+	pcie->dev = dev;
 
 	err = tegra_pcie_parse_dt(pcie);
 	if (err < 0)
@@ -2222,7 +2235,7 @@ static int tegra_pcie_probe(struct platform_device *pdev)
 
 	err = tegra_pcie_get_resources(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request resources: %d\n", err);
+		dev_err(dev, "failed to request resources: %d\n", err);
 		return err;
 	}
 
@@ -2236,27 +2249,23 @@ static int tegra_pcie_probe(struct platform_device *pdev)
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		err = tegra_pcie_enable_msi(pcie);
 		if (err < 0) {
-			dev_err(&pdev->dev,
-				"failed to enable MSI support: %d\n",
-				err);
+			dev_err(dev, "failed to enable MSI support: %d\n", err);
 			goto put_resources;
 		}
 	}
 
 	err = tegra_pcie_enable(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to enable PCIe ports: %d\n", err);
+		dev_err(dev, "failed to enable PCIe ports: %d\n", err);
 		goto disable_msi;
 	}
 
 	if (IS_ENABLED(CONFIG_DEBUG_FS)) {
 		err = tegra_pcie_debugfs_init(pcie);
 		if (err < 0)
-			dev_err(&pdev->dev, "failed to setup debugfs: %d\n",
-				err);
+			dev_err(dev, "failed to setup debugfs: %d\n", err);
 	}
 
-	platform_set_drvdata(pdev, pcie);
 	return 0;
 
 disable_msi:
diff --git a/drivers/pci/host/pci-xgene.c b/drivers/pci/host/pci-xgene.c
index a81273c23341..1de23d74783f 100644
--- a/drivers/pci/host/pci-xgene.c
+++ b/drivers/pci/host/pci-xgene.c
@@ -76,6 +76,16 @@ struct xgene_pcie_port {
 	u32			version;
 };
 
+static u32 xgene_pcie_readl(struct xgene_pcie_port *port, u32 reg)
+{
+	return readl(port->csr_base + reg);
+}
+
+static void xgene_pcie_writel(struct xgene_pcie_port *port, u32 reg, u32 val)
+{
+	writel(val, port->csr_base + reg);
+}
+
 static inline u32 pcie_bar_low_val(u32 addr, u32 flags)
 {
 	return (addr & PCI_BASE_ADDRESS_MEM_MASK) | flags;
@@ -112,9 +122,9 @@ static void xgene_pcie_set_rtdid_reg(struct pci_bus *bus, uint devfn)
 	if (!pci_is_root_bus(bus))
 		rtdid_val = (b << 8) | (d << 3) | f;
 
-	writel(rtdid_val, port->csr_base + RTDID);
+	xgene_pcie_writel(port, RTDID, rtdid_val);
 	/* read the register back to ensure flush */
-	readl(port->csr_base + RTDID);
+	xgene_pcie_readl(port, RTDID);
 }
 
 /*
@@ -179,28 +189,28 @@ static struct pci_ops xgene_pcie_ops = {
 	.write = pci_generic_config_write32,
 };
 
-static u64 xgene_pcie_set_ib_mask(void __iomem *csr_base, u32 addr,
+static u64 xgene_pcie_set_ib_mask(struct xgene_pcie_port *port, u32 addr,
 				  u32 flags, u64 size)
 {
 	u64 mask = (~(size - 1) & PCI_BASE_ADDRESS_MEM_MASK) | flags;
 	u32 val32 = 0;
 	u32 val;
 
-	val32 = readl(csr_base + addr);
+	val32 = xgene_pcie_readl(port, addr);
 	val = (val32 & 0x0000ffff) | (lower_32_bits(mask) << 16);
-	writel(val, csr_base + addr);
+	xgene_pcie_writel(port, addr, val);
 
-	val32 = readl(csr_base + addr + 0x04);
+	val32 = xgene_pcie_readl(port, addr + 0x04);
 	val = (val32 & 0xffff0000) | (lower_32_bits(mask) >> 16);
-	writel(val, csr_base + addr + 0x04);
+	xgene_pcie_writel(port, addr + 0x04, val);
 
-	val32 = readl(csr_base + addr + 0x04);
+	val32 = xgene_pcie_readl(port, addr + 0x04);
 	val = (val32 & 0x0000ffff) | (upper_32_bits(mask) << 16);
-	writel(val, csr_base + addr + 0x04);
+	xgene_pcie_writel(port, addr + 0x04, val);
 
-	val32 = readl(csr_base + addr + 0x08);
+	val32 = xgene_pcie_readl(port, addr + 0x08);
 	val = (val32 & 0xffff0000) | (upper_32_bits(mask) >> 16);
-	writel(val, csr_base + addr + 0x08);
+	xgene_pcie_writel(port, addr + 0x08, val);
 
 	return mask;
 }
@@ -208,32 +218,32 @@ static u64 xgene_pcie_set_ib_mask(void __iomem *csr_base, u32 addr,
 static void xgene_pcie_linkup(struct xgene_pcie_port *port,
 				   u32 *lanes, u32 *speed)
 {
-	void __iomem *csr_base = port->csr_base;
 	u32 val32;
 
 	port->link_up = false;
-	val32 = readl(csr_base + PCIECORE_CTLANDSTATUS);
+	val32 = xgene_pcie_readl(port, PCIECORE_CTLANDSTATUS);
 	if (val32 & LINK_UP_MASK) {
 		port->link_up = true;
 		*speed = PIPE_PHY_RATE_RD(val32);
-		val32 = readl(csr_base + BRIDGE_STATUS_0);
+		val32 = xgene_pcie_readl(port, BRIDGE_STATUS_0);
 		*lanes = val32 >> 26;
 	}
 }
 
 static int xgene_pcie_init_port(struct xgene_pcie_port *port)
 {
+	struct device *dev = port->dev;
 	int rc;
 
-	port->clk = clk_get(port->dev, NULL);
+	port->clk = clk_get(dev, NULL);
 	if (IS_ERR(port->clk)) {
-		dev_err(port->dev, "clock not available\n");
+		dev_err(dev, "clock not available\n");
 		return -ENODEV;
 	}
 
 	rc = clk_prepare_enable(port->clk);
 	if (rc) {
-		dev_err(port->dev, "clock enable failed\n");
+		dev_err(dev, "clock enable failed\n");
 		return rc;
 	}
 
@@ -243,15 +253,16 @@ static int xgene_pcie_init_port(struct xgene_pcie_port *port)
 static int xgene_pcie_map_reg(struct xgene_pcie_port *port,
 			      struct platform_device *pdev)
 {
+	struct device *dev = port->dev;
 	struct resource *res;
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "csr");
-	port->csr_base = devm_ioremap_resource(port->dev, res);
+	port->csr_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(port->csr_base))
 		return PTR_ERR(port->csr_base);
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
-	port->cfg_base = devm_ioremap_resource(port->dev, res);
+	port->cfg_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(port->cfg_base))
 		return PTR_ERR(port->cfg_base);
 	port->cfg_addr = res->start;
@@ -263,7 +274,7 @@ static void xgene_pcie_setup_ob_reg(struct xgene_pcie_port *port,
 				    struct resource *res, u32 offset,
 				    u64 cpu_addr, u64 pci_addr)
 {
-	void __iomem *base = port->csr_base + offset;
+	struct device *dev = port->dev;
 	resource_size_t size = resource_size(res);
 	u64 restype = resource_type(res);
 	u64 mask = 0;
@@ -280,22 +291,24 @@ static void xgene_pcie_setup_ob_reg(struct xgene_pcie_port *port,
 	if (size >= min_size)
 		mask = ~(size - 1) | flag;
 	else
-		dev_warn(port->dev, "res size 0x%llx less than minimum 0x%x\n",
+		dev_warn(dev, "res size 0x%llx less than minimum 0x%x\n",
 			 (u64)size, min_size);
 
-	writel(lower_32_bits(cpu_addr), base);
-	writel(upper_32_bits(cpu_addr), base + 0x04);
-	writel(lower_32_bits(mask), base + 0x08);
-	writel(upper_32_bits(mask), base + 0x0c);
-	writel(lower_32_bits(pci_addr), base + 0x10);
-	writel(upper_32_bits(pci_addr), base + 0x14);
+	xgene_pcie_writel(port, offset, lower_32_bits(cpu_addr));
+	xgene_pcie_writel(port, offset + 0x04, upper_32_bits(cpu_addr));
+	xgene_pcie_writel(port, offset + 0x08, lower_32_bits(mask));
+	xgene_pcie_writel(port, offset + 0x0c, upper_32_bits(mask));
+	xgene_pcie_writel(port, offset + 0x10, lower_32_bits(pci_addr));
+	xgene_pcie_writel(port, offset + 0x14, upper_32_bits(pci_addr));
 }
 
-static void xgene_pcie_setup_cfg_reg(void __iomem *csr_base, u64 addr)
+static void xgene_pcie_setup_cfg_reg(struct xgene_pcie_port *port)
 {
-	writel(lower_32_bits(addr), csr_base + CFGBARL);
-	writel(upper_32_bits(addr), csr_base + CFGBARH);
-	writel(EN_REG, csr_base + CFGCTL);
+	u64 addr = port->cfg_addr;
+
+	xgene_pcie_writel(port, CFGBARL, lower_32_bits(addr));
+	xgene_pcie_writel(port, CFGBARH, upper_32_bits(addr));
+	xgene_pcie_writel(port, CFGCTL, EN_REG);
 }
 
 static int xgene_pcie_map_ranges(struct xgene_pcie_port *port,
@@ -310,7 +323,7 @@ static int xgene_pcie_map_ranges(struct xgene_pcie_port *port,
 		struct resource *res = window->res;
 		u64 restype = resource_type(res);
 
-		dev_dbg(port->dev, "%pR\n", res);
+		dev_dbg(dev, "%pR\n", res);
 
 		switch (restype) {
 		case IORESOURCE_IO:
@@ -339,17 +352,18 @@ static int xgene_pcie_map_ranges(struct xgene_pcie_port *port,
 			return -EINVAL;
 		}
 	}
-	xgene_pcie_setup_cfg_reg(port->csr_base, port->cfg_addr);
-
+	xgene_pcie_setup_cfg_reg(port);
 	return 0;
 }
 
-static void xgene_pcie_setup_pims(void *addr, u64 pim, u64 size)
+static void xgene_pcie_setup_pims(struct xgene_pcie_port *port, u32 pim_reg,
+				  u64 pim, u64 size)
 {
-	writel(lower_32_bits(pim), addr);
-	writel(upper_32_bits(pim) | EN_COHERENCY, addr + 0x04);
-	writel(lower_32_bits(size), addr + 0x10);
-	writel(upper_32_bits(size), addr + 0x14);
+	xgene_pcie_writel(port, pim_reg, lower_32_bits(pim));
+	xgene_pcie_writel(port, pim_reg + 0x04,
+			  upper_32_bits(pim) | EN_COHERENCY);
+	xgene_pcie_writel(port, pim_reg + 0x10, lower_32_bits(size));
+	xgene_pcie_writel(port, pim_reg + 0x14, upper_32_bits(size));
 }
 
 /*
@@ -379,10 +393,10 @@ static int xgene_pcie_select_ib_reg(u8 *ib_reg_mask, u64 size)
 static void xgene_pcie_setup_ib_reg(struct xgene_pcie_port *port,
 				    struct of_pci_range *range, u8 *ib_reg_mask)
 {
-	void __iomem *csr_base = port->csr_base;
 	void __iomem *cfg_base = port->cfg_base;
+	struct device *dev = port->dev;
 	void *bar_addr;
-	void *pim_addr;
+	u32 pim_reg;
 	u64 cpu_addr = range->cpu_addr;
 	u64 pci_addr = range->pci_addr;
 	u64 size = range->size;
@@ -393,7 +407,7 @@ static void xgene_pcie_setup_ib_reg(struct xgene_pcie_port *port,
 
 	region = xgene_pcie_select_ib_reg(ib_reg_mask, range->size);
 	if (region < 0) {
-		dev_warn(port->dev, "invalid pcie dma-range config\n");
+		dev_warn(dev, "invalid pcie dma-range config\n");
 		return;
 	}
 
@@ -403,29 +417,27 @@ static void xgene_pcie_setup_ib_reg(struct xgene_pcie_port *port,
 	bar_low = pcie_bar_low_val((u32)cpu_addr, flags);
 	switch (region) {
 	case 0:
-		xgene_pcie_set_ib_mask(csr_base, BRIDGE_CFG_4, flags, size);
+		xgene_pcie_set_ib_mask(port, BRIDGE_CFG_4, flags, size);
 		bar_addr = cfg_base + PCI_BASE_ADDRESS_0;
 		writel(bar_low, bar_addr);
 		writel(upper_32_bits(cpu_addr), bar_addr + 0x4);
-		pim_addr = csr_base + PIM1_1L;
+		pim_reg = PIM1_1L;
 		break;
 	case 1:
-		bar_addr = csr_base + IBAR2;
-		writel(bar_low, bar_addr);
-		writel(lower_32_bits(mask), csr_base + IR2MSK);
-		pim_addr = csr_base + PIM2_1L;
+		xgene_pcie_writel(port, IBAR2, bar_low);
+		xgene_pcie_writel(port, IR2MSK, lower_32_bits(mask));
+		pim_reg = PIM2_1L;
 		break;
 	case 2:
-		bar_addr = csr_base + IBAR3L;
-		writel(bar_low, bar_addr);
-		writel(upper_32_bits(cpu_addr), bar_addr + 0x4);
-		writel(lower_32_bits(mask), csr_base + IR3MSKL);
-		writel(upper_32_bits(mask), csr_base + IR3MSKL + 0x4);
-		pim_addr = csr_base + PIM3_1L;
+		xgene_pcie_writel(port, IBAR3L, bar_low);
+		xgene_pcie_writel(port, IBAR3L + 0x4, upper_32_bits(cpu_addr));
+		xgene_pcie_writel(port, IR3MSKL, lower_32_bits(mask));
+		xgene_pcie_writel(port, IR3MSKL + 0x4, upper_32_bits(mask));
+		pim_reg = PIM3_1L;
 		break;
 	}
 
-	xgene_pcie_setup_pims(pim_addr, pci_addr, ~(size - 1));
+	xgene_pcie_setup_pims(port, pim_reg, pci_addr, ~(size - 1));
 }
 
 static int pci_dma_range_parser_init(struct of_pci_range_parser *parser,
@@ -463,7 +475,7 @@ static int xgene_pcie_parse_map_dma_ranges(struct xgene_pcie_port *port)
 	for_each_of_pci_range(&parser, &range) {
 		u64 end = range.cpu_addr + range.size - 1;
 
-		dev_dbg(port->dev, "0x%08x 0x%016llx..0x%016llx -> 0x%016llx\n",
+		dev_dbg(dev, "0x%08x 0x%016llx..0x%016llx -> 0x%016llx\n",
 			range.flags, range.cpu_addr, end, range.pci_addr);
 		xgene_pcie_setup_ib_reg(port, &range, &ib_reg_mask);
 	}
@@ -476,13 +488,14 @@ static void xgene_pcie_clear_config(struct xgene_pcie_port *port)
 	int i;
 
 	for (i = PIM1_1L; i <= CFGCTL; i += 4)
-		writel(0x0, port->csr_base + i);
+		xgene_pcie_writel(port, i, 0);
 }
 
 static int xgene_pcie_setup(struct xgene_pcie_port *port,
 			    struct list_head *res,
 			    resource_size_t io_base)
 {
+	struct device *dev = port->dev;
 	u32 val, lanes = 0, speed = 0;
 	int ret;
 
@@ -490,7 +503,7 @@ static int xgene_pcie_setup(struct xgene_pcie_port *port,
 
 	/* setup the vendor and device IDs correctly */
 	val = (XGENE_PCIE_DEVICEID << 16) | XGENE_PCIE_VENDORID;
-	writel(val, port->csr_base + BRIDGE_CFG_0);
+	xgene_pcie_writel(port, BRIDGE_CFG_0, val);
 
 	ret = xgene_pcie_map_ranges(port, res, io_base);
 	if (ret)
@@ -502,27 +515,28 @@ static int xgene_pcie_setup(struct xgene_pcie_port *port,
 
 	xgene_pcie_linkup(port, &lanes, &speed);
 	if (!port->link_up)
-		dev_info(port->dev, "(rc) link down\n");
+		dev_info(dev, "(rc) link down\n");
 	else
-		dev_info(port->dev, "(rc) x%d gen-%d link up\n",
-				lanes, speed + 1);
+		dev_info(dev, "(rc) x%d gen-%d link up\n", lanes, speed + 1);
 	return 0;
 }
 
 static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 {
-	struct device_node *dn = pdev->dev.of_node;
+	struct device *dev = &pdev->dev;
+	struct device_node *dn = dev->of_node;
 	struct xgene_pcie_port *port;
 	resource_size_t iobase = 0;
 	struct pci_bus *bus;
 	int ret;
 	LIST_HEAD(res);
 
-	port = devm_kzalloc(&pdev->dev, sizeof(*port), GFP_KERNEL);
+	port = devm_kzalloc(dev, sizeof(*port), GFP_KERNEL);
 	if (!port)
 		return -ENOMEM;
-	port->node = of_node_get(pdev->dev.of_node);
-	port->dev = &pdev->dev;
+
+	port->node = of_node_get(dn);
+	port->dev = dev;
 
 	port->version = XGENE_PCIE_IP_VER_UNKN;
 	if (of_device_is_compatible(port->node, "apm,xgene-pcie"))
@@ -540,7 +554,7 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	ret = devm_request_pci_bus_resources(&pdev->dev, &res);
+	ret = devm_request_pci_bus_resources(dev, &res);
 	if (ret)
 		goto error;
 
@@ -548,8 +562,7 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 	if (ret)
 		goto error;
 
-	bus = pci_create_root_bus(&pdev->dev, 0,
-					&xgene_pcie_ops, port, &res);
+	bus = pci_create_root_bus(dev, 0, &xgene_pcie_ops, port, &res);
 	if (!bus) {
 		ret = -ENOMEM;
 		goto error;
@@ -558,8 +571,6 @@ static int xgene_pcie_probe_bridge(struct platform_device *pdev)
 	pci_scan_child_bus(bus);
 	pci_assign_unassigned_bus_resources(bus);
 	pci_bus_add_devices(bus);
-
-	platform_set_drvdata(pdev, port);
 	return 0;
 
 error:
diff --git a/drivers/pci/host/pcie-altera.c b/drivers/pci/host/pcie-altera.c
index c24e96559cbb..b0ac4dfafa0b 100644
--- a/drivers/pci/host/pcie-altera.c
+++ b/drivers/pci/host/pcie-altera.c
@@ -55,15 +55,19 @@
 #define TLP_PAYLOAD_SIZE		0x01
 #define TLP_READ_TAG			0x1d
 #define TLP_WRITE_TAG			0x10
-#define TLP_CFG_DW0(fmttype)		(((fmttype) << 24) | TLP_PAYLOAD_SIZE)
-#define TLP_CFG_DW1(reqid, tag, be)	(((reqid) << 16) | (tag << 8) | (be))
+#define RP_DEVFN			0
+#define TLP_REQ_ID(bus, devfn)		(((bus) << 8) | (devfn))
+#define TLP_CFG_DW0(pcie, bus)						\
+    ((((bus == pcie->root_bus_nr) ? TLP_FMTTYPE_CFGRD0			\
+				    : TLP_FMTTYPE_CFGRD1) << 24) |	\
+     TLP_PAYLOAD_SIZE)
+#define TLP_CFG_DW1(pcie, tag, be)	\
+    (((TLP_REQ_ID(pcie->root_bus_nr,  RP_DEVFN)) << 16) | (tag << 8) | (be))
 #define TLP_CFG_DW2(bus, devfn, offset)	\
 				(((bus) << 24) | ((devfn) << 16) | (offset))
-#define TLP_REQ_ID(bus, devfn)		(((bus) << 8) | (devfn))
 #define TLP_COMP_STATUS(s)		(((s) >> 12) & 7)
 #define TLP_HDR_SIZE			3
 #define TLP_LOOP			500
-#define RP_DEVFN			0
 
 #define LINK_UP_TIMEOUT			HZ
 #define LINK_RETRAIN_TIMEOUT		HZ
@@ -74,7 +78,7 @@
 
 struct altera_pcie {
 	struct platform_device	*pdev;
-	void __iomem		*cra_base;
+	void __iomem		*cra_base;	/* DT Cra */
 	int			irq;
 	u8			root_bus_nr;
 	struct irq_domain	*irq_domain;
@@ -131,7 +135,7 @@ static void tlp_write_tx(struct altera_pcie *pcie,
 	cra_writel(pcie, tlp_rp_regdata->ctrl, RP_TX_CNTRL);
 }
 
-static bool altera_pcie_valid_config(struct altera_pcie *pcie,
+static bool altera_pcie_valid_device(struct altera_pcie *pcie,
 				     struct pci_bus *bus, int dev)
 {
 	/* If there is no link, then there is no device */
@@ -218,13 +222,8 @@ static int tlp_cfg_dword_read(struct altera_pcie *pcie, u8 bus, u32 devfn,
 {
 	u32 headers[TLP_HDR_SIZE];
 
-	if (bus == pcie->root_bus_nr)
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGRD0);
-	else
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGRD1);
-
-	headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, RP_DEVFN),
-					TLP_READ_TAG, byte_en);
+	headers[0] = TLP_CFG_DW0(pcie, bus);
+	headers[1] = TLP_CFG_DW1(pcie, TLP_READ_TAG, byte_en);
 	headers[2] = TLP_CFG_DW2(bus, devfn, where);
 
 	tlp_write_packet(pcie, headers, 0, false);
@@ -238,13 +237,8 @@ static int tlp_cfg_dword_write(struct altera_pcie *pcie, u8 bus, u32 devfn,
 	u32 headers[TLP_HDR_SIZE];
 	int ret;
 
-	if (bus == pcie->root_bus_nr)
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGWR0);
-	else
-		headers[0] = TLP_CFG_DW0(TLP_FMTTYPE_CFGWR1);
-
-	headers[1] = TLP_CFG_DW1(TLP_REQ_ID(pcie->root_bus_nr, RP_DEVFN),
-					TLP_WRITE_TAG, byte_en);
+	headers[0] = TLP_CFG_DW0(pcie, bus);
+	headers[1] = TLP_CFG_DW1(pcie, TLP_WRITE_TAG, byte_en);
 	headers[2] = TLP_CFG_DW2(bus, devfn, where);
 
 	/* check alignment to Qword */
@@ -342,7 +336,7 @@ static int altera_pcie_cfg_read(struct pci_bus *bus, unsigned int devfn,
 	if (altera_pcie_hide_rc_bar(bus, devfn, where))
 		return PCIBIOS_BAD_REGISTER_NUMBER;
 
-	if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn))) {
+	if (!altera_pcie_valid_device(pcie, bus, PCI_SLOT(devfn))) {
 		*value = 0xffffffff;
 		return PCIBIOS_DEVICE_NOT_FOUND;
 	}
@@ -359,7 +353,7 @@ static int altera_pcie_cfg_write(struct pci_bus *bus, unsigned int devfn,
 	if (altera_pcie_hide_rc_bar(bus, devfn, where))
 		return PCIBIOS_BAD_REGISTER_NUMBER;
 
-	if (!altera_pcie_valid_config(pcie, bus, PCI_SLOT(devfn)))
+	if (!altera_pcie_valid_device(pcie, bus, PCI_SLOT(devfn)))
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	return _altera_pcie_cfg_write(pcie, bus->number, devfn, where, size,
@@ -394,6 +388,7 @@ static int altera_write_cap_word(struct altera_pcie *pcie, u8 busno,
 
 static void altera_wait_link_retrain(struct altera_pcie *pcie)
 {
+	struct device *dev = &pcie->pdev->dev;
 	u16 reg16;
 	unsigned long start_jiffies;
 
@@ -406,7 +401,7 @@ static void altera_wait_link_retrain(struct altera_pcie *pcie)
 			break;
 
 		if (time_after(jiffies, start_jiffies + LINK_RETRAIN_TIMEOUT)) {
-			dev_err(&pcie->pdev->dev, "link retrain timeout\n");
+			dev_err(dev, "link retrain timeout\n");
 			break;
 		}
 		udelay(100);
@@ -419,7 +414,7 @@ static void altera_wait_link_retrain(struct altera_pcie *pcie)
 			break;
 
 		if (time_after(jiffies, start_jiffies + LINK_UP_TIMEOUT)) {
-			dev_err(&pcie->pdev->dev, "link up timeout\n");
+			dev_err(dev, "link up timeout\n");
 			break;
 		}
 		udelay(100);
@@ -460,7 +455,6 @@ static int altera_pcie_intx_map(struct irq_domain *domain, unsigned int irq,
 {
 	irq_set_chip_and_handler(irq, &dummy_irq_chip, handle_simple_irq);
 	irq_set_chip_data(irq, domain->host_data);
-
 	return 0;
 }
 
@@ -472,12 +466,14 @@ static void altera_pcie_isr(struct irq_desc *desc)
 {
 	struct irq_chip *chip = irq_desc_get_chip(desc);
 	struct altera_pcie *pcie;
+	struct device *dev;
 	unsigned long status;
 	u32 bit;
 	u32 virq;
 
 	chained_irq_enter(chip, desc);
 	pcie = irq_desc_get_handler_data(desc);
+	dev = &pcie->pdev->dev;
 
 	while ((status = cra_readl(pcie, P2A_INT_STATUS)
 		& P2A_INT_STS_ALL) != 0) {
@@ -489,8 +485,7 @@ static void altera_pcie_isr(struct irq_desc *desc)
 			if (virq)
 				generic_handle_irq(virq);
 			else
-				dev_err(&pcie->pdev->dev,
-					"unexpected IRQ, INT%d\n", bit);
+				dev_err(dev, "unexpected IRQ, INT%d\n", bit);
 		}
 	}
 
@@ -549,30 +544,25 @@ static int altera_pcie_init_irq_domain(struct altera_pcie *pcie)
 
 static int altera_pcie_parse_dt(struct altera_pcie *pcie)
 {
-	struct resource *cra;
+	struct device *dev = &pcie->pdev->dev;
 	struct platform_device *pdev = pcie->pdev;
+	struct resource *cra;
 
 	cra = platform_get_resource_byname(pdev, IORESOURCE_MEM, "Cra");
-	if (!cra) {
-		dev_err(&pdev->dev, "no Cra memory resource defined\n");
-		return -ENODEV;
-	}
-
-	pcie->cra_base = devm_ioremap_resource(&pdev->dev, cra);
+	pcie->cra_base = devm_ioremap_resource(dev, cra);
 	if (IS_ERR(pcie->cra_base)) {
-		dev_err(&pdev->dev, "failed to map cra memory\n");
+		dev_err(dev, "failed to map cra memory\n");
 		return PTR_ERR(pcie->cra_base);
 	}
 
 	/* setup IRQ */
 	pcie->irq = platform_get_irq(pdev, 0);
 	if (pcie->irq <= 0) {
-		dev_err(&pdev->dev, "failed to get IRQ: %d\n", pcie->irq);
+		dev_err(dev, "failed to get IRQ: %d\n", pcie->irq);
 		return -EINVAL;
 	}
 
 	irq_set_chained_handler_and_data(pcie->irq, altera_pcie_isr, pcie);
-
 	return 0;
 }
 
@@ -583,12 +573,13 @@ static void altera_pcie_host_init(struct altera_pcie *pcie)
 
 static int altera_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct altera_pcie *pcie;
 	struct pci_bus *bus;
 	struct pci_bus *child;
 	int ret;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
@@ -596,7 +587,7 @@ static int altera_pcie_probe(struct platform_device *pdev)
 
 	ret = altera_pcie_parse_dt(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Parsing DT failed\n");
+		dev_err(dev, "Parsing DT failed\n");
 		return ret;
 	}
 
@@ -604,13 +595,13 @@ static int altera_pcie_probe(struct platform_device *pdev)
 
 	ret = altera_pcie_parse_request_of_pci_ranges(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed add resources\n");
+		dev_err(dev, "Failed add resources\n");
 		return ret;
 	}
 
 	ret = altera_pcie_init_irq_domain(pcie);
 	if (ret) {
-		dev_err(&pdev->dev, "Failed creating IRQ Domain\n");
+		dev_err(dev, "Failed creating IRQ Domain\n");
 		return ret;
 	}
 
@@ -620,7 +611,7 @@ static int altera_pcie_probe(struct platform_device *pdev)
 	cra_writel(pcie, P2A_INT_ENA_ALL, P2A_INT_ENABLE);
 	altera_pcie_host_init(pcie);
 
-	bus = pci_scan_root_bus(&pdev->dev, pcie->root_bus_nr, &altera_pcie_ops,
+	bus = pci_scan_root_bus(dev, pcie->root_bus_nr, &altera_pcie_ops,
 				pcie, &pcie->resources);
 	if (!bus)
 		return -ENOMEM;
@@ -633,8 +624,6 @@ static int altera_pcie_probe(struct platform_device *pdev)
 		pcie_bus_configure_settings(child);
 
 	pci_bus_add_devices(bus);
-
-	platform_set_drvdata(pdev, pcie);
 	return ret;
 }
 
diff --git a/drivers/pci/host/pcie-armada8k.c b/drivers/pci/host/pcie-armada8k.c
index 0f4f570068e3..0ac0f18690f2 100644
--- a/drivers/pci/host/pcie-armada8k.c
+++ b/drivers/pci/host/pcie-armada8k.c
@@ -29,34 +29,33 @@
 #include "pcie-designware.h"
 
 struct armada8k_pcie {
-	void __iomem *base;
+	struct pcie_port pp;		/* pp.dbi_base is DT ctrl */
 	struct clk *clk;
-	struct pcie_port pp;
 };
 
 #define PCIE_VENDOR_REGS_OFFSET		0x8000
 
-#define PCIE_GLOBAL_CONTROL_REG		0x0
+#define PCIE_GLOBAL_CONTROL_REG		(PCIE_VENDOR_REGS_OFFSET + 0x0)
 #define PCIE_APP_LTSSM_EN		BIT(2)
 #define PCIE_DEVICE_TYPE_SHIFT		4
 #define PCIE_DEVICE_TYPE_MASK		0xF
 #define PCIE_DEVICE_TYPE_RC		0x4 /* Root complex */
 
-#define PCIE_GLOBAL_STATUS_REG		0x8
+#define PCIE_GLOBAL_STATUS_REG		(PCIE_VENDOR_REGS_OFFSET + 0x8)
 #define PCIE_GLB_STS_RDLH_LINK_UP	BIT(1)
 #define PCIE_GLB_STS_PHY_LINK_UP	BIT(9)
 
-#define PCIE_GLOBAL_INT_CAUSE1_REG	0x1C
-#define PCIE_GLOBAL_INT_MASK1_REG	0x20
+#define PCIE_GLOBAL_INT_CAUSE1_REG	(PCIE_VENDOR_REGS_OFFSET + 0x1C)
+#define PCIE_GLOBAL_INT_MASK1_REG	(PCIE_VENDOR_REGS_OFFSET + 0x20)
 #define PCIE_INT_A_ASSERT_MASK		BIT(9)
 #define PCIE_INT_B_ASSERT_MASK		BIT(10)
 #define PCIE_INT_C_ASSERT_MASK		BIT(11)
 #define PCIE_INT_D_ASSERT_MASK		BIT(12)
 
-#define PCIE_ARCACHE_TRC_REG		0x50
-#define PCIE_AWCACHE_TRC_REG		0x54
-#define PCIE_ARUSER_REG			0x5C
-#define PCIE_AWUSER_REG			0x60
+#define PCIE_ARCACHE_TRC_REG		(PCIE_VENDOR_REGS_OFFSET + 0x50)
+#define PCIE_AWCACHE_TRC_REG		(PCIE_VENDOR_REGS_OFFSET + 0x54)
+#define PCIE_ARUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x5C)
+#define PCIE_AWUSER_REG			(PCIE_VENDOR_REGS_OFFSET + 0x60)
 /*
  * AR/AW Cache defauls: Normal memory, Write-Back, Read / Write
  * allocate
@@ -72,11 +71,10 @@ struct armada8k_pcie {
 
 static int armada8k_pcie_link_up(struct pcie_port *pp)
 {
-	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
 	u32 reg;
 	u32 mask = PCIE_GLB_STS_RDLH_LINK_UP | PCIE_GLB_STS_PHY_LINK_UP;
 
-	reg = readl(pcie->base + PCIE_GLOBAL_STATUS_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_STATUS_REG);
 
 	if ((reg & mask) == mask)
 		return 1;
@@ -85,51 +83,50 @@ static int armada8k_pcie_link_up(struct pcie_port *pp)
 	return 0;
 }
 
-static void armada8k_pcie_establish_link(struct pcie_port *pp)
+static void armada8k_pcie_establish_link(struct armada8k_pcie *pcie)
 {
-	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
-	void __iomem *base = pcie->base;
+	struct pcie_port *pp = &pcie->pp;
 	u32 reg;
 
 	if (!dw_pcie_link_up(pp)) {
 		/* Disable LTSSM state machine to enable configuration */
-		reg = readl(base + PCIE_GLOBAL_CONTROL_REG);
+		reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_CONTROL_REG);
 		reg &= ~(PCIE_APP_LTSSM_EN);
-		writel(reg, base + PCIE_GLOBAL_CONTROL_REG);
+		dw_pcie_writel_rc(pp, PCIE_GLOBAL_CONTROL_REG, reg);
 	}
 
 	/* Set the device to root complex mode */
-	reg = readl(base + PCIE_GLOBAL_CONTROL_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_CONTROL_REG);
 	reg &= ~(PCIE_DEVICE_TYPE_MASK << PCIE_DEVICE_TYPE_SHIFT);
 	reg |= PCIE_DEVICE_TYPE_RC << PCIE_DEVICE_TYPE_SHIFT;
-	writel(reg, base + PCIE_GLOBAL_CONTROL_REG);
+	dw_pcie_writel_rc(pp, PCIE_GLOBAL_CONTROL_REG, reg);
 
 	/* Set the PCIe master AxCache attributes */
-	writel(ARCACHE_DEFAULT_VALUE, base + PCIE_ARCACHE_TRC_REG);
-	writel(AWCACHE_DEFAULT_VALUE, base + PCIE_AWCACHE_TRC_REG);
+	dw_pcie_writel_rc(pp, PCIE_ARCACHE_TRC_REG, ARCACHE_DEFAULT_VALUE);
+	dw_pcie_writel_rc(pp, PCIE_AWCACHE_TRC_REG, AWCACHE_DEFAULT_VALUE);
 
 	/* Set the PCIe master AxDomain attributes */
-	reg = readl(base + PCIE_ARUSER_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_ARUSER_REG);
 	reg &= ~(AX_USER_DOMAIN_MASK << AX_USER_DOMAIN_SHIFT);
 	reg |= DOMAIN_OUTER_SHAREABLE << AX_USER_DOMAIN_SHIFT;
-	writel(reg, base + PCIE_ARUSER_REG);
+	dw_pcie_writel_rc(pp, PCIE_ARUSER_REG, reg);
 
-	reg = readl(base + PCIE_AWUSER_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_AWUSER_REG);
 	reg &= ~(AX_USER_DOMAIN_MASK << AX_USER_DOMAIN_SHIFT);
 	reg |= DOMAIN_OUTER_SHAREABLE << AX_USER_DOMAIN_SHIFT;
-	writel(reg, base + PCIE_AWUSER_REG);
+	dw_pcie_writel_rc(pp, PCIE_AWUSER_REG, reg);
 
 	/* Enable INT A-D interrupts */
-	reg = readl(base + PCIE_GLOBAL_INT_MASK1_REG);
+	reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_INT_MASK1_REG);
 	reg |= PCIE_INT_A_ASSERT_MASK | PCIE_INT_B_ASSERT_MASK |
 	       PCIE_INT_C_ASSERT_MASK | PCIE_INT_D_ASSERT_MASK;
-	writel(reg, base + PCIE_GLOBAL_INT_MASK1_REG);
+	dw_pcie_writel_rc(pp, PCIE_GLOBAL_INT_MASK1_REG, reg);
 
 	if (!dw_pcie_link_up(pp)) {
 		/* Configuration done. Start LTSSM */
-		reg = readl(base + PCIE_GLOBAL_CONTROL_REG);
+		reg = dw_pcie_readl_rc(pp, PCIE_GLOBAL_CONTROL_REG);
 		reg |= PCIE_APP_LTSSM_EN;
-		writel(reg, base + PCIE_GLOBAL_CONTROL_REG);
+		dw_pcie_writel_rc(pp, PCIE_GLOBAL_CONTROL_REG, reg);
 	}
 
 	/* Wait until the link becomes active again */
@@ -139,15 +136,16 @@ static void armada8k_pcie_establish_link(struct pcie_port *pp)
 
 static void armada8k_pcie_host_init(struct pcie_port *pp)
 {
+	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
+
 	dw_pcie_setup_rc(pp);
-	armada8k_pcie_establish_link(pp);
+	armada8k_pcie_establish_link(pcie);
 }
 
 static irqreturn_t armada8k_pcie_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
-	struct armada8k_pcie *pcie = to_armada8k_pcie(pp);
-	void __iomem *base = pcie->base;
+	struct armada8k_pcie *pcie = arg;
+	struct pcie_port *pp = &pcie->pp;
 	u32 val;
 
 	/*
@@ -155,8 +153,8 @@ static irqreturn_t armada8k_pcie_irq_handler(int irq, void *arg)
 	 * PCI device. However, they are also latched into the PCIe
 	 * controller, so we simply discard them.
 	 */
-	val = readl(base + PCIE_GLOBAL_INT_CAUSE1_REG);
-	writel(val, base + PCIE_GLOBAL_INT_CAUSE1_REG);
+	val = dw_pcie_readl_rc(pp, PCIE_GLOBAL_INT_CAUSE1_REG);
+	dw_pcie_writel_rc(pp, PCIE_GLOBAL_INT_CAUSE1_REG, val);
 
 	return IRQ_HANDLED;
 }
@@ -166,9 +164,10 @@ static struct pcie_host_ops armada8k_pcie_host_ops = {
 	.host_init = armada8k_pcie_host_init,
 };
 
-static int armada8k_add_pcie_port(struct pcie_port *pp,
+static int armada8k_add_pcie_port(struct armada8k_pcie *pcie,
 				  struct platform_device *pdev)
 {
+	struct pcie_port *pp = &pcie->pp;
 	struct device *dev = &pdev->dev;
 	int ret;
 
@@ -182,7 +181,7 @@ static int armada8k_add_pcie_port(struct pcie_port *pp,
 	}
 
 	ret = devm_request_irq(dev, pp->irq, armada8k_pcie_irq_handler,
-			       IRQF_SHARED, "armada8k-pcie", pp);
+			       IRQF_SHARED, "armada8k-pcie", pcie);
 	if (ret) {
 		dev_err(dev, "failed to request irq %d\n", pp->irq);
 		return ret;
@@ -217,7 +216,6 @@ static int armada8k_pcie_probe(struct platform_device *pdev)
 
 	pp = &pcie->pp;
 	pp->dev = dev;
-	platform_set_drvdata(pdev, pcie);
 
 	/* Get the dw-pcie unit configuration/control registers base. */
 	base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "ctrl");
@@ -228,9 +226,7 @@ static int armada8k_pcie_probe(struct platform_device *pdev)
 		goto fail;
 	}
 
-	pcie->base = pp->dbi_base + PCIE_VENDOR_REGS_OFFSET;
-
-	ret = armada8k_add_pcie_port(pp, pdev);
+	ret = armada8k_add_pcie_port(pcie, pdev);
 	if (ret)
 		goto fail;
 
diff --git a/drivers/pci/host/pcie-artpec6.c b/drivers/pci/host/pcie-artpec6.c
index 39bf1a6df463..212786b27f1a 100644
--- a/drivers/pci/host/pcie-artpec6.c
+++ b/drivers/pci/host/pcie-artpec6.c
@@ -27,9 +27,9 @@
 #define to_artpec6_pcie(x)	container_of(x, struct artpec6_pcie, pp)
 
 struct artpec6_pcie {
-	struct pcie_port	pp;
-	struct regmap		*regmap;
-	void __iomem		*phy_base;
+	struct pcie_port	pp;		/* pp.dbi_base is DT dbi */
+	struct regmap		*regmap;	/* DT axis,syscon-pcie */
+	void __iomem		*phy_base;	/* DT phy */
 };
 
 /* PCIe Port Logic registers (memory-mapped) */
@@ -65,18 +65,31 @@ struct artpec6_pcie {
 
 #define ARTPEC6_CPU_TO_BUS_ADDR		0x0fffffff
 
-static int artpec6_pcie_establish_link(struct pcie_port *pp)
+static u32 artpec6_pcie_readl(struct artpec6_pcie *artpec6_pcie, u32 offset)
 {
-	struct artpec6_pcie *artpec6_pcie = to_artpec6_pcie(pp);
+	u32 val;
+
+	regmap_read(artpec6_pcie->regmap, offset, &val);
+	return val;
+}
+
+static void artpec6_pcie_writel(struct artpec6_pcie *artpec6_pcie, u32 offset, u32 val)
+{
+	regmap_write(artpec6_pcie->regmap, offset, val);
+}
+
+static int artpec6_pcie_establish_link(struct artpec6_pcie *artpec6_pcie)
+{
+	struct pcie_port *pp = &artpec6_pcie->pp;
 	u32 val;
 	unsigned int retries;
 
 	/* Hold DW core in reset */
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |= PCIECFG_CORE_RESET_REQ;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |=  PCIECFG_RISRCREN |	/* Receiver term. 50 Ohm */
 		PCIECFG_MODE_TX_DRV_EN |
 		PCIECFG_CISRREN |	/* Reference clock term. 100 Ohm */
@@ -84,27 +97,27 @@ static int artpec6_pcie_establish_link(struct pcie_port *pp)
 	val |= PCIECFG_REFCLK_ENABLE;
 	val &= ~PCIECFG_DBG_OEN;
 	val &= ~PCIECFG_CLKREQ_B;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 	usleep_range(5000, 6000);
 
-	regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, NOCCFG);
 	val |= NOCCFG_ENABLE_CLK_PCIE;
-	regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+	artpec6_pcie_writel(artpec6_pcie, NOCCFG, val);
 	usleep_range(20, 30);
 
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |= PCIECFG_PCLK_ENABLE | PCIECFG_PLL_ENABLE;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 	usleep_range(6000, 7000);
 
-	regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, NOCCFG);
 	val &= ~NOCCFG_POWER_PCIE_IDLEREQ;
-	regmap_write(artpec6_pcie->regmap, NOCCFG, val);
+	artpec6_pcie_writel(artpec6_pcie, NOCCFG, val);
 
 	retries = 50;
 	do {
 		usleep_range(1000, 2000);
-		regmap_read(artpec6_pcie->regmap, NOCCFG, &val);
+		val = artpec6_pcie_readl(artpec6_pcie, NOCCFG);
 		retries--;
 	} while (retries &&
 		(val & (NOCCFG_POWER_PCIE_IDLEACK | NOCCFG_POWER_PCIE_IDLE)));
@@ -117,16 +130,16 @@ static int artpec6_pcie_establish_link(struct pcie_port *pp)
 	} while (retries && !(val & PHY_COSPLLLOCK));
 
 	/* Take DW core out of reset */
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val &= ~PCIECFG_CORE_RESET_REQ;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 	usleep_range(100, 200);
 
 	/*
 	 * Enable writing to config regs. This is required as the Synopsys
 	 * driver changes the class code. That register needs DBI write enable.
 	 */
-	writel(DBI_RO_WR_EN, pp->dbi_base + MISC_CONTROL_1_OFF);
+	dw_pcie_writel_rc(pp, MISC_CONTROL_1_OFF, DBI_RO_WR_EN);
 
 	pp->io_base &= ARTPEC6_CPU_TO_BUS_ADDR;
 	pp->mem_base &= ARTPEC6_CPU_TO_BUS_ADDR;
@@ -137,78 +150,69 @@ static int artpec6_pcie_establish_link(struct pcie_port *pp)
 	dw_pcie_setup_rc(pp);
 
 	/* assert LTSSM enable */
-	regmap_read(artpec6_pcie->regmap, PCIECFG, &val);
+	val = artpec6_pcie_readl(artpec6_pcie, PCIECFG);
 	val |= PCIECFG_LTSSM_ENABLE;
-	regmap_write(artpec6_pcie->regmap, PCIECFG, val);
+	artpec6_pcie_writel(artpec6_pcie, PCIECFG, val);
 
 	/* check if the link is up or not */
 	if (!dw_pcie_wait_for_link(pp))
 		return 0;
 
 	dev_dbg(pp->dev, "DEBUG_R0: 0x%08x, DEBUG_R1: 0x%08x\n",
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R0),
-		readl(pp->dbi_base + PCIE_PHY_DEBUG_R1));
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R0),
+		dw_pcie_readl_rc(pp, PCIE_PHY_DEBUG_R1));
 
 	return -ETIMEDOUT;
 }
 
-static void artpec6_pcie_enable_interrupts(struct pcie_port *pp)
+static void artpec6_pcie_enable_interrupts(struct artpec6_pcie *artpec6_pcie)
 {
+	struct pcie_port *pp = &artpec6_pcie->pp;
+
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		dw_pcie_msi_init(pp);
 }
 
 static void artpec6_pcie_host_init(struct pcie_port *pp)
 {
-	artpec6_pcie_establish_link(pp);
-	artpec6_pcie_enable_interrupts(pp);
-}
-
-static int artpec6_pcie_link_up(struct pcie_port *pp)
-{
-	u32 rc;
-
-	/*
-	 * Get status from Synopsys IP
-	 * link is debug bit 36, debug register 1 starts at bit 32
-	 */
-	rc = readl(pp->dbi_base + PCIE_PHY_DEBUG_R1) & (0x1 << (36 - 32));
-	if (rc)
-		return 1;
+	struct artpec6_pcie *artpec6_pcie = to_artpec6_pcie(pp);
 
-	return 0;
+	artpec6_pcie_establish_link(artpec6_pcie);
+	artpec6_pcie_enable_interrupts(artpec6_pcie);
 }
 
 static struct pcie_host_ops artpec6_pcie_host_ops = {
-	.link_up = artpec6_pcie_link_up,
 	.host_init = artpec6_pcie_host_init,
 };
 
 static irqreturn_t artpec6_pcie_msi_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
+	struct artpec6_pcie *artpec6_pcie = arg;
+	struct pcie_port *pp = &artpec6_pcie->pp;
 
 	return dw_handle_msi_irq(pp);
 }
 
-static int artpec6_add_pcie_port(struct pcie_port *pp,
+static int artpec6_add_pcie_port(struct artpec6_pcie *artpec6_pcie,
 				 struct platform_device *pdev)
 {
+	struct pcie_port *pp = &artpec6_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		pp->msi_irq = platform_get_irq_byname(pdev, "msi");
 		if (pp->msi_irq <= 0) {
-			dev_err(&pdev->dev, "failed to get MSI irq\n");
+			dev_err(dev, "failed to get MSI irq\n");
 			return -ENODEV;
 		}
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 				       artpec6_pcie_msi_handler,
 				       IRQF_SHARED | IRQF_NO_THREAD,
-				       "artpec6-pcie-msi", pp);
+				       "artpec6-pcie-msi", artpec6_pcie);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request MSI irq\n");
+			dev_err(dev, "failed to request MSI irq\n");
 			return ret;
 		}
 	}
@@ -218,7 +222,7 @@ static int artpec6_add_pcie_port(struct pcie_port *pp,
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -227,41 +231,40 @@ static int artpec6_add_pcie_port(struct pcie_port *pp,
 
 static int artpec6_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct artpec6_pcie *artpec6_pcie;
 	struct pcie_port *pp;
 	struct resource *dbi_base;
 	struct resource *phy_base;
 	int ret;
 
-	artpec6_pcie = devm_kzalloc(&pdev->dev, sizeof(*artpec6_pcie),
-				    GFP_KERNEL);
+	artpec6_pcie = devm_kzalloc(dev, sizeof(*artpec6_pcie), GFP_KERNEL);
 	if (!artpec6_pcie)
 		return -ENOMEM;
 
 	pp = &artpec6_pcie->pp;
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
-	pp->dbi_base = devm_ioremap_resource(&pdev->dev, dbi_base);
+	pp->dbi_base = devm_ioremap_resource(dev, dbi_base);
 	if (IS_ERR(pp->dbi_base))
 		return PTR_ERR(pp->dbi_base);
 
 	phy_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "phy");
-	artpec6_pcie->phy_base = devm_ioremap_resource(&pdev->dev, phy_base);
+	artpec6_pcie->phy_base = devm_ioremap_resource(dev, phy_base);
 	if (IS_ERR(artpec6_pcie->phy_base))
 		return PTR_ERR(artpec6_pcie->phy_base);
 
 	artpec6_pcie->regmap =
-		syscon_regmap_lookup_by_phandle(pdev->dev.of_node,
+		syscon_regmap_lookup_by_phandle(dev->of_node,
 						"axis,syscon-pcie");
 	if (IS_ERR(artpec6_pcie->regmap))
 		return PTR_ERR(artpec6_pcie->regmap);
 
-	ret = artpec6_add_pcie_port(pp, pdev);
+	ret = artpec6_add_pcie_port(artpec6_pcie, pdev);
 	if (ret < 0)
 		return ret;
 
-	platform_set_drvdata(pdev, artpec6_pcie);
 	return 0;
 }
 
diff --git a/drivers/pci/host/pcie-designware-plat.c b/drivers/pci/host/pcie-designware-plat.c
index 17da005497a5..537f58a664fa 100644
--- a/drivers/pci/host/pcie-designware-plat.c
+++ b/drivers/pci/host/pcie-designware-plat.c
@@ -25,8 +25,7 @@
 #include "pcie-designware.h"
 
 struct dw_plat_pcie {
-	void __iomem		*mem_base;
-	struct pcie_port	pp;
+	struct pcie_port	pp;	/* pp.dbi_base is DT 0th resource */
 };
 
 static irqreturn_t dw_plat_pcie_msi_irq_handler(int irq, void *arg)
@@ -52,6 +51,7 @@ static struct pcie_host_ops dw_plat_pcie_host_ops = {
 static int dw_plat_add_pcie_port(struct pcie_port *pp,
 				 struct platform_device *pdev)
 {
+	struct device *dev = pp->dev;
 	int ret;
 
 	pp->irq = platform_get_irq(pdev, 1);
@@ -63,11 +63,11 @@ static int dw_plat_add_pcie_port(struct pcie_port *pp,
 		if (pp->msi_irq < 0)
 			return pp->msi_irq;
 
-		ret = devm_request_irq(&pdev->dev, pp->msi_irq,
+		ret = devm_request_irq(dev, pp->msi_irq,
 					dw_plat_pcie_msi_irq_handler,
 					IRQF_SHARED, "dw-plat-pcie-msi", pp);
 		if (ret) {
-			dev_err(&pdev->dev, "failed to request MSI IRQ\n");
+			dev_err(dev, "failed to request MSI IRQ\n");
 			return ret;
 		}
 	}
@@ -77,7 +77,7 @@ static int dw_plat_add_pcie_port(struct pcie_port *pp,
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -86,31 +86,28 @@ static int dw_plat_add_pcie_port(struct pcie_port *pp,
 
 static int dw_plat_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct dw_plat_pcie *dw_plat_pcie;
 	struct pcie_port *pp;
 	struct resource *res;  /* Resource from DT */
 	int ret;
 
-	dw_plat_pcie = devm_kzalloc(&pdev->dev, sizeof(*dw_plat_pcie),
-					GFP_KERNEL);
+	dw_plat_pcie = devm_kzalloc(dev, sizeof(*dw_plat_pcie), GFP_KERNEL);
 	if (!dw_plat_pcie)
 		return -ENOMEM;
 
 	pp = &dw_plat_pcie->pp;
-	pp->dev = &pdev->dev;
+	pp->dev = dev;
 
 	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-	dw_plat_pcie->mem_base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(dw_plat_pcie->mem_base))
-		return PTR_ERR(dw_plat_pcie->mem_base);
-
-	pp->dbi_base = dw_plat_pcie->mem_base;
+	pp->dbi_base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(pp->dbi_base))
+		return PTR_ERR(pp->dbi_base);
 
 	ret = dw_plat_add_pcie_port(pp, pdev);
 	if (ret < 0)
 		return ret;
 
-	platform_set_drvdata(pdev, dw_plat_pcie);
 	return 0;
 }
 
diff --git a/drivers/pci/host/pcie-designware.c b/drivers/pci/host/pcie-designware.c
index 74da71ea544a..035f50c03281 100644
--- a/drivers/pci/host/pcie-designware.c
+++ b/drivers/pci/host/pcie-designware.c
@@ -141,41 +141,35 @@ int dw_pcie_cfg_write(void __iomem *addr, int size, u32 val)
 	return PCIBIOS_SUCCESSFUL;
 }
 
-static inline u32 dw_pcie_readl_rc(struct pcie_port *pp, u32 reg)
+u32 dw_pcie_readl_rc(struct pcie_port *pp, u32 reg)
 {
 	if (pp->ops->readl_rc)
-		return pp->ops->readl_rc(pp, pp->dbi_base + reg);
+		return pp->ops->readl_rc(pp, reg);
 
 	return readl(pp->dbi_base + reg);
 }
 
-static inline void dw_pcie_writel_rc(struct pcie_port *pp, u32 val, u32 reg)
+void dw_pcie_writel_rc(struct pcie_port *pp, u32 reg, u32 val)
 {
 	if (pp->ops->writel_rc)
-		pp->ops->writel_rc(pp, val, pp->dbi_base + reg);
+		pp->ops->writel_rc(pp, reg, val);
 	else
 		writel(val, pp->dbi_base + reg);
 }
 
-static inline u32 dw_pcie_readl_unroll(struct pcie_port *pp, u32 index, u32 reg)
+static u32 dw_pcie_readl_unroll(struct pcie_port *pp, u32 index, u32 reg)
 {
 	u32 offset = PCIE_GET_ATU_OUTB_UNR_REG_OFFSET(index);
 
-	if (pp->ops->readl_rc)
-		return pp->ops->readl_rc(pp, pp->dbi_base + offset + reg);
-
-	return readl(pp->dbi_base + offset + reg);
+	return dw_pcie_readl_rc(pp, offset + reg);
 }
 
-static inline void dw_pcie_writel_unroll(struct pcie_port *pp, u32 index,
-					 u32 val, u32 reg)
+static void dw_pcie_writel_unroll(struct pcie_port *pp, u32 index, u32 reg,
+				  u32 val)
 {
 	u32 offset = PCIE_GET_ATU_OUTB_UNR_REG_OFFSET(index);
 
-	if (pp->ops->writel_rc)
-		pp->ops->writel_rc(pp, val, pp->dbi_base + offset + reg);
-	else
-		writel(val, pp->dbi_base + offset + reg);
+	dw_pcie_writel_rc(pp, offset + reg, val);
 }
 
 static int dw_pcie_rd_own_conf(struct pcie_port *pp, int where, int size,
@@ -202,35 +196,35 @@ static void dw_pcie_prog_outbound_atu(struct pcie_port *pp, int index,
 	u32 retries, val;
 
 	if (pp->iatu_unroll_enabled) {
-		dw_pcie_writel_unroll(pp, index,
-			lower_32_bits(cpu_addr), PCIE_ATU_UNR_LOWER_BASE);
-		dw_pcie_writel_unroll(pp, index,
-			upper_32_bits(cpu_addr), PCIE_ATU_UNR_UPPER_BASE);
-		dw_pcie_writel_unroll(pp, index,
-			lower_32_bits(cpu_addr + size - 1), PCIE_ATU_UNR_LIMIT);
-		dw_pcie_writel_unroll(pp, index,
-			lower_32_bits(pci_addr), PCIE_ATU_UNR_LOWER_TARGET);
-		dw_pcie_writel_unroll(pp, index,
-			upper_32_bits(pci_addr), PCIE_ATU_UNR_UPPER_TARGET);
-		dw_pcie_writel_unroll(pp, index,
-			type, PCIE_ATU_UNR_REGION_CTRL1);
-		dw_pcie_writel_unroll(pp, index,
-			PCIE_ATU_ENABLE, PCIE_ATU_UNR_REGION_CTRL2);
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_LOWER_BASE,
+			lower_32_bits(cpu_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_UPPER_BASE,
+			upper_32_bits(cpu_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_LIMIT,
+			lower_32_bits(cpu_addr + size - 1));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_LOWER_TARGET,
+			lower_32_bits(pci_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_UPPER_TARGET,
+			upper_32_bits(pci_addr));
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_REGION_CTRL1,
+			type);
+		dw_pcie_writel_unroll(pp, index, PCIE_ATU_UNR_REGION_CTRL2,
+			PCIE_ATU_ENABLE);
 	} else {
-		dw_pcie_writel_rc(pp, PCIE_ATU_REGION_OUTBOUND | index,
-						PCIE_ATU_VIEWPORT);
-		dw_pcie_writel_rc(pp, lower_32_bits(cpu_addr),
-						PCIE_ATU_LOWER_BASE);
-		dw_pcie_writel_rc(pp, upper_32_bits(cpu_addr),
-						PCIE_ATU_UPPER_BASE);
-		dw_pcie_writel_rc(pp, lower_32_bits(cpu_addr + size - 1),
-						PCIE_ATU_LIMIT);
-		dw_pcie_writel_rc(pp, lower_32_bits(pci_addr),
-						PCIE_ATU_LOWER_TARGET);
-		dw_pcie_writel_rc(pp, upper_32_bits(pci_addr),
-						PCIE_ATU_UPPER_TARGET);
-		dw_pcie_writel_rc(pp, type, PCIE_ATU_CR1);
-		dw_pcie_writel_rc(pp, PCIE_ATU_ENABLE, PCIE_ATU_CR2);
+		dw_pcie_writel_rc(pp, PCIE_ATU_VIEWPORT,
+				  PCIE_ATU_REGION_OUTBOUND | index);
+		dw_pcie_writel_rc(pp, PCIE_ATU_LOWER_BASE,
+				  lower_32_bits(cpu_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_UPPER_BASE,
+				  upper_32_bits(cpu_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_LIMIT,
+				  lower_32_bits(cpu_addr + size - 1));
+		dw_pcie_writel_rc(pp, PCIE_ATU_LOWER_TARGET,
+				  lower_32_bits(pci_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_UPPER_TARGET,
+				  upper_32_bits(pci_addr));
+		dw_pcie_writel_rc(pp, PCIE_ATU_CR1, type);
+		dw_pcie_writel_rc(pp, PCIE_ATU_CR2, PCIE_ATU_ENABLE);
 	}
 
 	/*
@@ -760,8 +754,8 @@ static int dw_pcie_wr_other_conf(struct pcie_port *pp, struct pci_bus *bus,
 	return ret;
 }
 
-static int dw_pcie_valid_config(struct pcie_port *pp,
-				struct pci_bus *bus, int dev)
+static int dw_pcie_valid_device(struct pcie_port *pp, struct pci_bus *bus,
+				int dev)
 {
 	/* If there is no link, then there is no device */
 	if (bus->number != pp->root_bus_nr) {
@@ -781,7 +775,7 @@ static int dw_pcie_rd_conf(struct pci_bus *bus, u32 devfn, int where,
 {
 	struct pcie_port *pp = bus->sysdata;
 
-	if (dw_pcie_valid_config(pp, bus, PCI_SLOT(devfn)) == 0) {
+	if (!dw_pcie_valid_device(pp, bus, PCI_SLOT(devfn))) {
 		*val = 0xffffffff;
 		return PCIBIOS_DEVICE_NOT_FOUND;
 	}
@@ -797,7 +791,7 @@ static int dw_pcie_wr_conf(struct pci_bus *bus, u32 devfn,
 {
 	struct pcie_port *pp = bus->sysdata;
 
-	if (dw_pcie_valid_config(pp, bus, PCI_SLOT(devfn)) == 0)
+	if (!dw_pcie_valid_device(pp, bus, PCI_SLOT(devfn)))
 		return PCIBIOS_DEVICE_NOT_FOUND;
 
 	if (bus->number == pp->root_bus_nr)
@@ -835,7 +829,7 @@ void dw_pcie_setup_rc(struct pcie_port *pp)
 		dev_err(pp->dev, "num-lanes %u: invalid value\n", pp->lanes);
 		return;
 	}
-	dw_pcie_writel_rc(pp, val, PCIE_PORT_LINK_CONTROL);
+	dw_pcie_writel_rc(pp, PCIE_PORT_LINK_CONTROL, val);
 
 	/* set link width speed control register */
 	val = dw_pcie_readl_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL);
@@ -854,30 +848,30 @@ void dw_pcie_setup_rc(struct pcie_port *pp)
 		val |= PORT_LOGIC_LINK_WIDTH_8_LANES;
 		break;
 	}
-	dw_pcie_writel_rc(pp, val, PCIE_LINK_WIDTH_SPEED_CONTROL);
+	dw_pcie_writel_rc(pp, PCIE_LINK_WIDTH_SPEED_CONTROL, val);
 
 	/* setup RC BARs */
-	dw_pcie_writel_rc(pp, 0x00000004, PCI_BASE_ADDRESS_0);
-	dw_pcie_writel_rc(pp, 0x00000000, PCI_BASE_ADDRESS_1);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_0, 0x00000004);
+	dw_pcie_writel_rc(pp, PCI_BASE_ADDRESS_1, 0x00000000);
 
 	/* setup interrupt pins */
 	val = dw_pcie_readl_rc(pp, PCI_INTERRUPT_LINE);
 	val &= 0xffff00ff;
 	val |= 0x00000100;
-	dw_pcie_writel_rc(pp, val, PCI_INTERRUPT_LINE);
+	dw_pcie_writel_rc(pp, PCI_INTERRUPT_LINE, val);
 
 	/* setup bus numbers */
 	val = dw_pcie_readl_rc(pp, PCI_PRIMARY_BUS);
 	val &= 0xff000000;
 	val |= 0x00010100;
-	dw_pcie_writel_rc(pp, val, PCI_PRIMARY_BUS);
+	dw_pcie_writel_rc(pp, PCI_PRIMARY_BUS, val);
 
 	/* setup command register */
 	val = dw_pcie_readl_rc(pp, PCI_COMMAND);
 	val &= 0xffff0000;
 	val |= PCI_COMMAND_IO | PCI_COMMAND_MEMORY |
 		PCI_COMMAND_MASTER | PCI_COMMAND_SERR;
-	dw_pcie_writel_rc(pp, val, PCI_COMMAND);
+	dw_pcie_writel_rc(pp, PCI_COMMAND, val);
 
 	/*
 	 * If the platform provides ->rd_other_conf, it means the platform
diff --git a/drivers/pci/host/pcie-designware.h b/drivers/pci/host/pcie-designware.h
index c8e5bc647f49..a567ea288ee2 100644
--- a/drivers/pci/host/pcie-designware.h
+++ b/drivers/pci/host/pcie-designware.h
@@ -54,9 +54,8 @@ struct pcie_port {
 };
 
 struct pcie_host_ops {
-	u32 (*readl_rc)(struct pcie_port *pp, void __iomem *dbi_base);
-	void (*writel_rc)(struct pcie_port *pp,
-			u32 val, void __iomem *dbi_base);
+	u32 (*readl_rc)(struct pcie_port *pp, u32 reg);
+	void (*writel_rc)(struct pcie_port *pp, u32 reg, u32 val);
 	int (*rd_own_conf)(struct pcie_port *pp, int where, int size, u32 *val);
 	int (*wr_own_conf)(struct pcie_port *pp, int where, int size, u32 val);
 	int (*rd_other_conf)(struct pcie_port *pp, struct pci_bus *bus,
@@ -73,6 +72,8 @@ struct pcie_host_ops {
 	int (*msi_host_init)(struct pcie_port *pp, struct msi_controller *chip);
 };
 
+u32 dw_pcie_readl_rc(struct pcie_port *pp, u32 reg);
+void dw_pcie_writel_rc(struct pcie_port *pp, u32 reg, u32 val);
 int dw_pcie_cfg_read(void __iomem *addr, int size, u32 *val);
 int dw_pcie_cfg_write(void __iomem *addr, int size, u32 val);
 irqreturn_t dw_handle_msi_irq(struct pcie_port *pp);
diff --git a/drivers/pci/host/pcie-hisi.c b/drivers/pci/host/pcie-hisi.c
index 7ee9dfcc45fb..56154c25980c 100644
--- a/drivers/pci/host/pcie-hisi.c
+++ b/drivers/pci/host/pcie-hisi.c
@@ -22,51 +22,38 @@
 
 #include "pcie-designware.h"
 
-#define PCIE_LTSSM_LINKUP_STATE				0x11
-#define PCIE_LTSSM_STATE_MASK				0x3F
-#define PCIE_SUBCTRL_SYS_STATE4_REG			0x6818
-#define PCIE_SYS_STATE4						0x31c
-#define PCIE_HIP06_CTRL_OFF					0x1000
+#define PCIE_SUBCTRL_SYS_STATE4_REG		0x6818
+#define PCIE_HIP06_CTRL_OFF			0x1000
+#define PCIE_SYS_STATE4				(PCIE_HIP06_CTRL_OFF + 0x31c)
+#define PCIE_LTSSM_LINKUP_STATE			0x11
+#define PCIE_LTSSM_STATE_MASK			0x3F
 
 #define to_hisi_pcie(x)	container_of(x, struct hisi_pcie, pp)
 
 struct hisi_pcie;
 
 struct pcie_soc_ops {
-	int (*hisi_pcie_link_up)(struct hisi_pcie *pcie);
+	int (*hisi_pcie_link_up)(struct hisi_pcie *hisi_pcie);
 };
 
 struct hisi_pcie {
+	struct pcie_port pp;		/* pp.dbi_base is DT rc_dbi */
 	struct regmap *subctrl;
-	void __iomem *reg_base;
 	u32 port_id;
-	struct pcie_port pp;
 	struct pcie_soc_ops *soc_ops;
 };
 
-static inline void hisi_pcie_apb_writel(struct hisi_pcie *pcie,
-					u32 val, u32 reg)
-{
-	writel(val, pcie->reg_base + reg);
-}
-
-static inline u32 hisi_pcie_apb_readl(struct hisi_pcie *pcie, u32 reg)
-{
-	return readl(pcie->reg_base + reg);
-}
-
 /* HipXX PCIe host only supports 32-bit config access */
 static int hisi_pcie_cfg_read(struct pcie_port *pp, int where, int size,
 			      u32 *val)
 {
 	u32 reg;
 	u32 reg_val;
-	struct hisi_pcie *pcie = to_hisi_pcie(pp);
 	void *walker = &reg_val;
 
 	walker += (where & 0x3);
 	reg = where & ~0x3;
-	reg_val = hisi_pcie_apb_readl(pcie, reg);
+	reg_val = dw_pcie_readl_rc(pp, reg);
 
 	if (size == 1)
 		*val = *(u8 __force *) walker;
@@ -86,21 +73,20 @@ static int hisi_pcie_cfg_write(struct pcie_port *pp, int where, int  size,
 {
 	u32 reg_val;
 	u32 reg;
-	struct hisi_pcie *pcie = to_hisi_pcie(pp);
 	void *walker = &reg_val;
 
 	walker += (where & 0x3);
 	reg = where & ~0x3;
 	if (size == 4)
-		hisi_pcie_apb_writel(pcie, val, reg);
+		dw_pcie_writel_rc(pp, reg, val);
 	else if (size == 2) {
-		reg_val = hisi_pcie_apb_readl(pcie, reg);
+		reg_val = dw_pcie_readl_rc(pp, reg);
 		*(u16 __force *) walker = val;
-		hisi_pcie_apb_writel(pcie, reg_val, reg);
+		dw_pcie_writel_rc(pp, reg, reg_val);
 	} else if (size == 1) {
-		reg_val = hisi_pcie_apb_readl(pcie, reg);
+		reg_val = dw_pcie_readl_rc(pp, reg);
 		*(u8 __force *) walker = val;
-		hisi_pcie_apb_writel(pcie, reg_val, reg);
+		dw_pcie_writel_rc(pp, reg, reg_val);
 	} else
 		return PCIBIOS_BAD_REGISTER_NUMBER;
 
@@ -119,10 +105,10 @@ static int hisi_pcie_link_up_hip05(struct hisi_pcie *hisi_pcie)
 
 static int hisi_pcie_link_up_hip06(struct hisi_pcie *hisi_pcie)
 {
+	struct pcie_port *pp = &hisi_pcie->pp;
 	u32 val;
 
-	val = hisi_pcie_apb_readl(hisi_pcie, PCIE_HIP06_CTRL_OFF +
-			PCIE_SYS_STATE4);
+	val = dw_pcie_readl_rc(pp, PCIE_SYS_STATE4);
 
 	return ((val & PCIE_LTSSM_STATE_MASK) == PCIE_LTSSM_LINKUP_STATE);
 }
@@ -140,19 +126,20 @@ static struct pcie_host_ops hisi_pcie_host_ops = {
 	.link_up = hisi_pcie_link_up,
 };
 
-static int hisi_add_pcie_port(struct pcie_port *pp,
-				     struct platform_device *pdev)
+static int hisi_add_pcie_port(struct hisi_pcie *hisi_pcie,
+			      struct platform_device *pdev)
 {
+	struct pcie_port *pp = &hisi_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 	u32 port_id;
-	struct hisi_pcie *hisi_pcie = to_hisi_pcie(pp);
 
-	if (of_property_read_u32(pdev->dev.of_node, "port-id", &port_id)) {
-		dev_err(&pdev->dev, "failed to read port-id\n");
+	if (of_property_read_u32(dev->of_node, "port-id", &port_id)) {
+		dev_err(dev, "failed to read port-id\n");
 		return -EINVAL;
 	}
 	if (port_id > 3) {
-		dev_err(&pdev->dev, "Invalid port-id: %d\n", port_id);
+		dev_err(dev, "Invalid port-id: %d\n", port_id);
 		return -EINVAL;
 	}
 	hisi_pcie->port_id = port_id;
@@ -161,7 +148,7 @@ static int hisi_add_pcie_port(struct pcie_port *pp,
 
 	ret = dw_pcie_host_init(pp);
 	if (ret) {
-		dev_err(&pdev->dev, "failed to initialize host\n");
+		dev_err(dev, "failed to initialize host\n");
 		return ret;
 	}
 
@@ -170,6 +157,7 @@ static int hisi_add_pcie_port(struct pcie_port *pp,
 
 static int hisi_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct hisi_pcie *hisi_pcie;
 	struct pcie_port *pp;
 	const struct of_device_id *match;
@@ -177,40 +165,36 @@ static int hisi_pcie_probe(struct platform_device *pdev)
 	struct device_driver *driver;
 	int ret;
 
-	hisi_pcie = devm_kzalloc(&pdev->dev, sizeof(*hisi_pcie), GFP_KERNEL);
+	hisi_pcie = devm_kzalloc(dev, sizeof(*hisi_pcie), GFP_KERNEL);
 	if (!hisi_pcie)
 		return -ENOMEM;
 
 	pp = &hisi_pcie->pp;
-	pp->dev = &pdev->dev;
-	driver = (pdev->dev).driver;
+	pp->dev = dev;
+	driver = dev->driver;
 
-	match = of_match_device(driver->of_match_table, &pdev->dev);
+	match = of_match_device(driver->of_match_table, dev);
 	hisi_pcie->soc_ops = (struct pcie_soc_ops *) match->data;
 
 	hisi_pcie->subctrl =
 	syscon_regmap_lookup_by_compatible("hisilicon,pcie-sas-subctrl");
 	if (IS_ERR(hisi_pcie->subctrl)) {
-		dev_err(pp->dev, "cannot get subctrl base\n");
+		dev_err(dev, "cannot get subctrl base\n");
 		return PTR_ERR(hisi_pcie->subctrl);
 	}
 
 	reg = platform_get_resource_byname(pdev, IORESOURCE_MEM, "rc_dbi");
-	hisi_pcie->reg_base = devm_ioremap_resource(&pdev->dev, reg);
-	if (IS_ERR(hisi_pcie->reg_base)) {
-		dev_err(pp->dev, "cannot get rc_dbi base\n");
-		return PTR_ERR(hisi_pcie->reg_base);
+	pp->dbi_base = devm_ioremap_resource(dev, reg);
+	if (IS_ERR(pp->dbi_base)) {
+		dev_err(dev, "cannot get rc_dbi base\n");
+		return PTR_ERR(pp->dbi_base);
 	}
 
-	hisi_pcie->pp.dbi_base = hisi_pcie->reg_base;
-
-	ret = hisi_add_pcie_port(pp, pdev);
+	ret = hisi_add_pcie_port(hisi_pcie, pdev);
 	if (ret)
 		return ret;
 
-	platform_set_drvdata(pdev, hisi_pcie);
-
-	dev_warn(pp->dev, "only 32-bit config accesses supported; smaller writes may corrupt adjacent RW1C fields\n");
+	dev_warn(dev, "only 32-bit config accesses supported; smaller writes may corrupt adjacent RW1C fields\n");
 
 	return 0;
 }
diff --git a/drivers/pci/host/pcie-iproc-bcma.c b/drivers/pci/host/pcie-iproc-bcma.c
index 0d7bee4a0d26..8ce089043a27 100644
--- a/drivers/pci/host/pcie-iproc-bcma.c
+++ b/drivers/pci/host/pcie-iproc-bcma.c
@@ -42,19 +42,24 @@ static int iproc_pcie_bcma_map_irq(const struct pci_dev *dev, u8 slot, u8 pin)
 
 static int iproc_pcie_bcma_probe(struct bcma_device *bdev)
 {
+	struct device *dev = &bdev->dev;
 	struct iproc_pcie *pcie;
 	LIST_HEAD(res);
 	struct resource res_mem;
 	int ret;
 
-	pcie = devm_kzalloc(&bdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &bdev->dev;
-	bcma_set_drvdata(bdev, pcie);
+	pcie->dev = dev;
 
 	pcie->base = bdev->io_addr;
+	if (!pcie->base) {
+		dev_err(dev, "no controller registers\n");
+		return -ENOMEM;
+	}
+
 	pcie->base_addr = bdev->addr;
 
 	res_mem.start = bdev->addr_s[0];
@@ -67,10 +72,11 @@ static int iproc_pcie_bcma_probe(struct bcma_device *bdev)
 
 	ret = iproc_pcie_setup(pcie, &res);
 	if (ret)
-		dev_err(pcie->dev, "PCIe controller setup failed\n");
+		dev_err(dev, "PCIe controller setup failed\n");
 
 	pci_free_resource_list(&res);
 
+	bcma_set_drvdata(bdev, pcie);
 	return ret;
 }
 
diff --git a/drivers/pci/host/pcie-iproc-platform.c b/drivers/pci/host/pcie-iproc-platform.c
index 1738c5288eb6..a3de087976b3 100644
--- a/drivers/pci/host/pcie-iproc-platform.c
+++ b/drivers/pci/host/pcie-iproc-platform.c
@@ -40,35 +40,35 @@ MODULE_DEVICE_TABLE(of, iproc_pcie_of_match_table);
 
 static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	const struct of_device_id *of_id;
 	struct iproc_pcie *pcie;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct resource reg;
 	resource_size_t iobase = 0;
 	LIST_HEAD(res);
 	int ret;
 
-	of_id = of_match_device(iproc_pcie_of_match_table, &pdev->dev);
+	of_id = of_match_device(iproc_pcie_of_match_table, dev);
 	if (!of_id)
 		return -EINVAL;
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(struct iproc_pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &pdev->dev;
+	pcie->dev = dev;
 	pcie->type = (enum iproc_pcie_type)of_id->data;
-	platform_set_drvdata(pdev, pcie);
 
 	ret = of_address_to_resource(np, 0, &reg);
 	if (ret < 0) {
-		dev_err(pcie->dev, "unable to obtain controller resources\n");
+		dev_err(dev, "unable to obtain controller resources\n");
 		return ret;
 	}
 
-	pcie->base = devm_ioremap(pcie->dev, reg.start, resource_size(&reg));
+	pcie->base = devm_ioremap(dev, reg.start, resource_size(&reg));
 	if (!pcie->base) {
-		dev_err(pcie->dev, "unable to map controller registers\n");
+		dev_err(dev, "unable to map controller registers\n");
 		return -ENOMEM;
 	}
 	pcie->base_addr = reg.start;
@@ -79,7 +79,7 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 		ret = of_property_read_u32(np, "brcm,pcie-ob-axi-offset",
 					   &val);
 		if (ret) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"missing brcm,pcie-ob-axi-offset property\n");
 			return ret;
 		}
@@ -88,7 +88,7 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 		ret = of_property_read_u32(np, "brcm,pcie-ob-window-size",
 					   &val);
 		if (ret) {
-			dev_err(pcie->dev,
+			dev_err(dev,
 				"missing brcm,pcie-ob-window-size property\n");
 			return ret;
 		}
@@ -101,7 +101,7 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 	}
 
 	/* PHY use is optional */
-	pcie->phy = devm_phy_get(&pdev->dev, "pcie-phy");
+	pcie->phy = devm_phy_get(dev, "pcie-phy");
 	if (IS_ERR(pcie->phy)) {
 		if (PTR_ERR(pcie->phy) == -EPROBE_DEFER)
 			return -EPROBE_DEFER;
@@ -110,7 +110,7 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 
 	ret = of_pci_get_host_bridge_resources(np, 0, 0xff, &res, &iobase);
 	if (ret) {
-		dev_err(pcie->dev,
+		dev_err(dev,
 			"unable to get PCI host bridge resources\n");
 		return ret;
 	}
@@ -119,10 +119,11 @@ static int iproc_pcie_pltfm_probe(struct platform_device *pdev)
 
 	ret = iproc_pcie_setup(pcie, &res);
 	if (ret)
-		dev_err(pcie->dev, "PCIe controller setup failed\n");
+		dev_err(dev, "PCIe controller setup failed\n");
 
 	pci_free_resource_list(&res);
 
+	platform_set_drvdata(pdev, pcie);
 	return ret;
 }
 
diff --git a/drivers/pci/host/pcie-iproc.c b/drivers/pci/host/pcie-iproc.c
index e167b2f0098d..0b999a9fb843 100644
--- a/drivers/pci/host/pcie-iproc.c
+++ b/drivers/pci/host/pcie-iproc.c
@@ -63,6 +63,8 @@
 #define OARR_SIZE_CFG_SHIFT          1
 #define OARR_SIZE_CFG                BIT(OARR_SIZE_CFG_SHIFT)
 
+#define PCI_EXP_CAP			0xac
+
 #define MAX_NUM_OB_WINDOWS           2
 
 #define IPROC_PCIE_REG_INVALID 0xffff
@@ -258,9 +260,10 @@ static void iproc_pcie_reset(struct iproc_pcie *pcie)
 
 static int iproc_pcie_check_link(struct iproc_pcie *pcie, struct pci_bus *bus)
 {
+	struct device *dev = pcie->dev;
 	u8 hdr_type;
 	u32 link_ctrl, class, val;
-	u16 pos, link_status;
+	u16 pos = PCI_EXP_CAP, link_status;
 	bool link_is_active = false;
 
 	/*
@@ -272,14 +275,14 @@ static int iproc_pcie_check_link(struct iproc_pcie *pcie, struct pci_bus *bus)
 
 	val = iproc_pcie_read_reg(pcie, IPROC_PCIE_LINK_STATUS);
 	if (!(val & PCIE_PHYLINKUP) || !(val & PCIE_DL_ACTIVE)) {
-		dev_err(pcie->dev, "PHY or data link is INACTIVE!\n");
+		dev_err(dev, "PHY or data link is INACTIVE!\n");
 		return -ENODEV;
 	}
 
 	/* make sure we are not in EP mode */
 	pci_bus_read_config_byte(bus, 0, PCI_HEADER_TYPE, &hdr_type);
 	if ((hdr_type & 0x7f) != PCI_HEADER_TYPE_BRIDGE) {
-		dev_err(pcie->dev, "in EP mode, hdr=%#02x\n", hdr_type);
+		dev_err(dev, "in EP mode, hdr=%#02x\n", hdr_type);
 		return -EFAULT;
 	}
 
@@ -293,30 +296,27 @@ static int iproc_pcie_check_link(struct iproc_pcie *pcie, struct pci_bus *bus)
 	pci_bus_write_config_dword(bus, 0, PCI_BRIDGE_CTRL_REG_OFFSET, class);
 
 	/* check link status to see if link is active */
-	pos = pci_bus_find_capability(bus, 0, PCI_CAP_ID_EXP);
 	pci_bus_read_config_word(bus, 0, pos + PCI_EXP_LNKSTA, &link_status);
 	if (link_status & PCI_EXP_LNKSTA_NLW)
 		link_is_active = true;
 
 	if (!link_is_active) {
 		/* try GEN 1 link speed */
-#define PCI_LINK_STATUS_CTRL_2_OFFSET 0x0dc
 #define PCI_TARGET_LINK_SPEED_MASK    0xf
 #define PCI_TARGET_LINK_SPEED_GEN2    0x2
 #define PCI_TARGET_LINK_SPEED_GEN1    0x1
 		pci_bus_read_config_dword(bus, 0,
-					  PCI_LINK_STATUS_CTRL_2_OFFSET,
+					  pos + PCI_EXP_LNKCTL2,
 					  &link_ctrl);
 		if ((link_ctrl & PCI_TARGET_LINK_SPEED_MASK) ==
 		    PCI_TARGET_LINK_SPEED_GEN2) {
 			link_ctrl &= ~PCI_TARGET_LINK_SPEED_MASK;
 			link_ctrl |= PCI_TARGET_LINK_SPEED_GEN1;
 			pci_bus_write_config_dword(bus, 0,
-					   PCI_LINK_STATUS_CTRL_2_OFFSET,
+					   pos + PCI_EXP_LNKCTL2,
 					   link_ctrl);
 			msleep(100);
 
-			pos = pci_bus_find_capability(bus, 0, PCI_CAP_ID_EXP);
 			pci_bus_read_config_word(bus, 0, pos + PCI_EXP_LNKSTA,
 						 &link_status);
 			if (link_status & PCI_EXP_LNKSTA_NLW)
@@ -324,7 +324,7 @@ static int iproc_pcie_check_link(struct iproc_pcie *pcie, struct pci_bus *bus)
 		}
 	}
 
-	dev_info(pcie->dev, "link: %s\n", link_is_active ? "UP" : "DOWN");
+	dev_info(dev, "link: %s\n", link_is_active ? "UP" : "DOWN");
 
 	return link_is_active ? 0 : -ENODEV;
 }
@@ -349,12 +349,13 @@ static int iproc_pcie_setup_ob(struct iproc_pcie *pcie, u64 axi_addr,
 			       u64 pci_addr, resource_size_t size)
 {
 	struct iproc_pcie_ob *ob = &pcie->ob;
+	struct device *dev = pcie->dev;
 	unsigned i;
 	u64 max_size = (u64)ob->window_size * MAX_NUM_OB_WINDOWS;
 	u64 remainder;
 
 	if (size > max_size) {
-		dev_err(pcie->dev,
+		dev_err(dev,
 			"res size %pap exceeds max supported size 0x%llx\n",
 			&size, max_size);
 		return -EINVAL;
@@ -362,15 +363,14 @@ static int iproc_pcie_setup_ob(struct iproc_pcie *pcie, u64 axi_addr,
 
 	div64_u64_rem(size, ob->window_size, &remainder);
 	if (remainder) {
-		dev_err(pcie->dev,
+		dev_err(dev,
 			"res size %pap needs to be multiple of window size %pap\n",
 			&size, &ob->window_size);
 		return -EINVAL;
 	}
 
 	if (axi_addr < ob->axi_offset) {
-		dev_err(pcie->dev,
-			"axi address %pap less than offset %pap\n",
+		dev_err(dev, "axi address %pap less than offset %pap\n",
 			&axi_addr, &ob->axi_offset);
 		return -EINVAL;
 	}
@@ -406,6 +406,7 @@ static int iproc_pcie_setup_ob(struct iproc_pcie *pcie, u64 axi_addr,
 static int iproc_pcie_map_ranges(struct iproc_pcie *pcie,
 				 struct list_head *resources)
 {
+	struct device *dev = pcie->dev;
 	struct resource_entry *window;
 	int ret;
 
@@ -425,7 +426,7 @@ static int iproc_pcie_map_ranges(struct iproc_pcie *pcie,
 				return ret;
 			break;
 		default:
-			dev_err(pcie->dev, "invalid resource %pR\n", res);
+			dev_err(dev, "invalid resource %pR\n", res);
 			return -EINVAL;
 		}
 	}
@@ -455,26 +456,25 @@ static void iproc_pcie_msi_disable(struct iproc_pcie *pcie)
 
 int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
 {
+	struct device *dev;
 	int ret;
 	void *sysdata;
 	struct pci_bus *bus;
 
-	if (!pcie || !pcie->dev || !pcie->base)
-		return -EINVAL;
-
-	ret = devm_request_pci_bus_resources(pcie->dev, res);
+	dev = pcie->dev;
+	ret = devm_request_pci_bus_resources(dev, res);
 	if (ret)
 		return ret;
 
 	ret = phy_init(pcie->phy);
 	if (ret) {
-		dev_err(pcie->dev, "unable to initialize PCIe PHY\n");
+		dev_err(dev, "unable to initialize PCIe PHY\n");
 		return ret;
 	}
 
 	ret = phy_power_on(pcie->phy);
 	if (ret) {
-		dev_err(pcie->dev, "unable to power on PCIe PHY\n");
+		dev_err(dev, "unable to power on PCIe PHY\n");
 		goto err_exit_phy;
 	}
 
@@ -486,7 +486,7 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
 		pcie->reg_offsets = iproc_pcie_reg_paxc;
 		break;
 	default:
-		dev_err(pcie->dev, "incompatible iProc PCIe interface\n");
+		dev_err(dev, "incompatible iProc PCIe interface\n");
 		ret = -EINVAL;
 		goto err_power_off_phy;
 	}
@@ -496,7 +496,7 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
 	if (pcie->need_ob_cfg) {
 		ret = iproc_pcie_map_ranges(pcie, res);
 		if (ret) {
-			dev_err(pcie->dev, "map failed\n");
+			dev_err(dev, "map failed\n");
 			goto err_power_off_phy;
 		}
 	}
@@ -508,9 +508,9 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
 	sysdata = pcie;
 #endif
 
-	bus = pci_create_root_bus(pcie->dev, 0, &iproc_pcie_ops, sysdata, res);
+	bus = pci_create_root_bus(dev, 0, &iproc_pcie_ops, sysdata, res);
 	if (!bus) {
-		dev_err(pcie->dev, "unable to create PCI root bus\n");
+		dev_err(dev, "unable to create PCI root bus\n");
 		ret = -ENOMEM;
 		goto err_power_off_phy;
 	}
@@ -518,7 +518,7 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
 
 	ret = iproc_pcie_check_link(pcie, bus);
 	if (ret) {
-		dev_err(pcie->dev, "no PCIe EP device detected\n");
+		dev_err(dev, "no PCIe EP device detected\n");
 		goto err_rm_root_bus;
 	}
 
@@ -526,7 +526,7 @@ int iproc_pcie_setup(struct iproc_pcie *pcie, struct list_head *res)
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
 		if (iproc_pcie_msi_enable(pcie))
-			dev_info(pcie->dev, "not using iProc MSI\n");
+			dev_info(dev, "not using iProc MSI\n");
 
 	pci_scan_child_bus(bus);
 	pci_assign_unassigned_bus_resources(bus);
diff --git a/drivers/pci/host/pcie-qcom.c b/drivers/pci/host/pcie-qcom.c
index 5ec2d440a6b7..ef0a84c7a588 100644
--- a/drivers/pci/host/pcie-qcom.c
+++ b/drivers/pci/host/pcie-qcom.c
@@ -86,12 +86,10 @@ struct qcom_pcie_ops {
 };
 
 struct qcom_pcie {
-	struct pcie_port pp;
-	struct device *dev;
+	struct pcie_port pp;			/* pp.dbi_base is DT dbi */
+	void __iomem *parf;			/* DT parf */
+	void __iomem *elbi;			/* DT elbi */
 	union qcom_pcie_resources res;
-	void __iomem *parf;
-	void __iomem *dbi;
-	void __iomem *elbi;
 	struct phy *phy;
 	struct gpio_desc *reset;
 	struct qcom_pcie_ops *ops;
@@ -136,7 +134,7 @@ static int qcom_pcie_establish_link(struct qcom_pcie *pcie)
 static int qcom_pcie_get_resources_v0(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v0 *res = &pcie->res.v0;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 
 	res->vdda = devm_regulator_get(dev, "vdda");
 	if (IS_ERR(res->vdda))
@@ -188,7 +186,7 @@ static int qcom_pcie_get_resources_v0(struct qcom_pcie *pcie)
 static int qcom_pcie_get_resources_v1(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v1 *res = &pcie->res.v1;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 
 	res->vdda = devm_regulator_get(dev, "vdda");
 	if (IS_ERR(res->vdda))
@@ -237,7 +235,7 @@ static void qcom_pcie_deinit_v0(struct qcom_pcie *pcie)
 static int qcom_pcie_init_v0(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v0 *res = &pcie->res.v0;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 	u32 val;
 	int ret;
 
@@ -359,7 +357,7 @@ static void qcom_pcie_deinit_v1(struct qcom_pcie *pcie)
 static int qcom_pcie_init_v1(struct qcom_pcie *pcie)
 {
 	struct qcom_pcie_resources_v1 *res = &pcie->res.v1;
-	struct device *dev = pcie->dev;
+	struct device *dev = pcie->pp.dev;
 	int ret;
 
 	ret = reset_control_deassert(res->core);
@@ -426,7 +424,7 @@ err_res:
 static int qcom_pcie_link_up(struct pcie_port *pp)
 {
 	struct qcom_pcie *pcie = to_qcom_pcie(pp);
-	u16 val = readw(pcie->dbi + PCIE20_CAP + PCI_EXP_LNKSTA);
+	u16 val = readw(pcie->pp.dbi_base + PCIE20_CAP + PCI_EXP_LNKSTA);
 
 	return !!(val & PCI_EXP_LNKSTA_DLLLA);
 }
@@ -509,8 +507,8 @@ static int qcom_pcie_probe(struct platform_device *pdev)
 	if (!pcie)
 		return -ENOMEM;
 
+	pp = &pcie->pp;
 	pcie->ops = (struct qcom_pcie_ops *)of_device_get_match_data(dev);
-	pcie->dev = dev;
 
 	pcie->reset = devm_gpiod_get_optional(dev, "perst", GPIOD_OUT_LOW);
 	if (IS_ERR(pcie->reset))
@@ -522,9 +520,9 @@ static int qcom_pcie_probe(struct platform_device *pdev)
 		return PTR_ERR(pcie->parf);
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
-	pcie->dbi = devm_ioremap_resource(dev, res);
-	if (IS_ERR(pcie->dbi))
-		return PTR_ERR(pcie->dbi);
+	pp->dbi_base = devm_ioremap_resource(dev, res);
+	if (IS_ERR(pp->dbi_base))
+		return PTR_ERR(pp->dbi_base);
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "elbi");
 	pcie->elbi = devm_ioremap_resource(dev, res);
@@ -539,9 +537,7 @@ static int qcom_pcie_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	pp = &pcie->pp;
 	pp->dev = dev;
-	pp->dbi_base = pcie->dbi;
 	pp->root_bus_nr = -1;
 	pp->ops = &qcom_pcie_dw_ops;
 
@@ -569,8 +565,6 @@ static int qcom_pcie_probe(struct platform_device *pdev)
 		return ret;
 	}
 
-	platform_set_drvdata(pdev, pcie);
-
 	return 0;
 }
 
diff --git a/drivers/pci/host/pcie-rcar.c b/drivers/pci/host/pcie-rcar.c
index e06b1d3b4dea..62700d1896f4 100644
--- a/drivers/pci/host/pcie-rcar.c
+++ b/drivers/pci/host/pcie-rcar.c
@@ -31,8 +31,6 @@
 #include <linux/pm_runtime.h>
 #include <linux/slab.h>
 
-#define DRV_NAME "rcar-pcie"
-
 #define PCIECAR			0x000010
 #define PCIECCTLR		0x000018
 #define  CONFIG_SEND_ENABLE	(1 << 31)
@@ -397,6 +395,7 @@ static int rcar_pcie_setup(struct list_head *resource, struct rcar_pcie *pci)
 
 static void rcar_pcie_force_speedup(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	unsigned int timeout = 1000;
 	u32 macsr;
 
@@ -404,7 +403,7 @@ static void rcar_pcie_force_speedup(struct rcar_pcie *pcie)
 		return;
 
 	if (rcar_pci_read_reg(pcie, MACCTLR) & SPEED_CHANGE) {
-		dev_err(pcie->dev, "Speed change already in progress\n");
+		dev_err(dev, "Speed change already in progress\n");
 		return;
 	}
 
@@ -433,7 +432,7 @@ static void rcar_pcie_force_speedup(struct rcar_pcie *pcie)
 			rcar_pci_write_reg(pcie, macsr, MACSR);
 
 			if (macsr & SPCHGFAIL)
-				dev_err(pcie->dev, "Speed change failed\n");
+				dev_err(dev, "Speed change failed\n");
 
 			goto done;
 		}
@@ -441,15 +440,16 @@ static void rcar_pcie_force_speedup(struct rcar_pcie *pcie)
 		msleep(1);
 	};
 
-	dev_err(pcie->dev, "Speed change timed out\n");
+	dev_err(dev, "Speed change timed out\n");
 
 done:
-	dev_info(pcie->dev, "Current link speed is %s GT/s\n",
+	dev_info(dev, "Current link speed is %s GT/s\n",
 		 (macsr & LINK_SPEED) == LINK_SPEED_5_0GTS ? "5" : "2.5");
 }
 
 static int rcar_pcie_enable(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct pci_bus *bus, *child;
 	LIST_HEAD(res);
 
@@ -461,14 +461,14 @@ static int rcar_pcie_enable(struct rcar_pcie *pcie)
 	pci_add_flags(PCI_REASSIGN_ALL_RSRC | PCI_REASSIGN_ALL_BUS);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI))
-		bus = pci_scan_root_bus_msi(pcie->dev, pcie->root_bus_nr,
+		bus = pci_scan_root_bus_msi(dev, pcie->root_bus_nr,
 				&rcar_pcie_ops, pcie, &res, &pcie->msi.chip);
 	else
-		bus = pci_scan_root_bus(pcie->dev, pcie->root_bus_nr,
+		bus = pci_scan_root_bus(dev, pcie->root_bus_nr,
 				&rcar_pcie_ops, pcie, &res);
 
 	if (!bus) {
-		dev_err(pcie->dev, "Scanning rootbus failed");
+		dev_err(dev, "Scanning rootbus failed");
 		return -ENODEV;
 	}
 
@@ -487,6 +487,7 @@ static int rcar_pcie_enable(struct rcar_pcie *pcie)
 
 static int phy_wait_for_ack(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	unsigned int timeout = 100;
 
 	while (timeout--) {
@@ -496,7 +497,7 @@ static int phy_wait_for_ack(struct rcar_pcie *pcie)
 		udelay(100);
 	}
 
-	dev_err(pcie->dev, "Access to PCIe phy timed out\n");
+	dev_err(dev, "Access to PCIe phy timed out\n");
 
 	return -ETIMEDOUT;
 }
@@ -697,6 +698,7 @@ static irqreturn_t rcar_pcie_msi_irq(int irq, void *data)
 {
 	struct rcar_pcie *pcie = data;
 	struct rcar_msi *msi = &pcie->msi;
+	struct device *dev = pcie->dev;
 	unsigned long reg;
 
 	reg = rcar_pci_read_reg(pcie, PCIEMSIFR);
@@ -717,10 +719,10 @@ static irqreturn_t rcar_pcie_msi_irq(int irq, void *data)
 			if (test_bit(index, msi->used))
 				generic_handle_irq(irq);
 			else
-				dev_info(pcie->dev, "unhandled MSI\n");
+				dev_info(dev, "unhandled MSI\n");
 		} else {
 			/* Unknown MSI, just clear it */
-			dev_dbg(pcie->dev, "unexpected MSI\n");
+			dev_dbg(dev, "unexpected MSI\n");
 		}
 
 		/* see if there's any more pending in this vector */
@@ -843,22 +845,22 @@ static const struct irq_domain_ops msi_domain_ops = {
 
 static int rcar_pcie_enable_msi(struct rcar_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
 	struct rcar_msi *msi = &pcie->msi;
 	unsigned long base;
 	int err, i;
 
 	mutex_init(&msi->lock);
 
-	msi->chip.dev = pcie->dev;
+	msi->chip.dev = dev;
 	msi->chip.setup_irq = rcar_msi_setup_irq;
 	msi->chip.setup_irqs = rcar_msi_setup_irqs;
 	msi->chip.teardown_irq = rcar_msi_teardown_irq;
 
-	msi->domain = irq_domain_add_linear(pcie->dev->of_node, INT_PCI_MSI_NR,
+	msi->domain = irq_domain_add_linear(dev->of_node, INT_PCI_MSI_NR,
 					    &msi_domain_ops, &msi->chip);
 	if (!msi->domain) {
-		dev_err(&pdev->dev, "failed to create IRQ domain\n");
+		dev_err(dev, "failed to create IRQ domain\n");
 		return -ENOMEM;
 	}
 
@@ -866,19 +868,19 @@ static int rcar_pcie_enable_msi(struct rcar_pcie *pcie)
 		irq_create_mapping(msi->domain, i);
 
 	/* Two irqs are for MSI, but they are also used for non-MSI irqs */
-	err = devm_request_irq(&pdev->dev, msi->irq1, rcar_pcie_msi_irq,
+	err = devm_request_irq(dev, msi->irq1, rcar_pcie_msi_irq,
 			       IRQF_SHARED | IRQF_NO_THREAD,
 			       rcar_msi_irq_chip.name, pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request IRQ: %d\n", err);
+		dev_err(dev, "failed to request IRQ: %d\n", err);
 		goto err;
 	}
 
-	err = devm_request_irq(&pdev->dev, msi->irq2, rcar_pcie_msi_irq,
+	err = devm_request_irq(dev, msi->irq2, rcar_pcie_msi_irq,
 			       IRQF_SHARED | IRQF_NO_THREAD,
 			       rcar_msi_irq_chip.name, pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request IRQ: %d\n", err);
+		dev_err(dev, "failed to request IRQ: %d\n", err);
 		goto err;
 	}
 
@@ -899,32 +901,32 @@ err:
 	return err;
 }
 
-static int rcar_pcie_get_resources(struct platform_device *pdev,
-				   struct rcar_pcie *pcie)
+static int rcar_pcie_get_resources(struct rcar_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	struct resource res;
 	int err, i;
 
-	err = of_address_to_resource(pdev->dev.of_node, 0, &res);
+	err = of_address_to_resource(dev->of_node, 0, &res);
 	if (err)
 		return err;
 
-	pcie->base = devm_ioremap_resource(&pdev->dev, &res);
+	pcie->base = devm_ioremap_resource(dev, &res);
 	if (IS_ERR(pcie->base))
 		return PTR_ERR(pcie->base);
 
-	pcie->clk = devm_clk_get(&pdev->dev, "pcie");
+	pcie->clk = devm_clk_get(dev, "pcie");
 	if (IS_ERR(pcie->clk)) {
-		dev_err(pcie->dev, "cannot get platform clock\n");
+		dev_err(dev, "cannot get platform clock\n");
 		return PTR_ERR(pcie->clk);
 	}
 	err = clk_prepare_enable(pcie->clk);
 	if (err)
 		return err;
 
-	pcie->bus_clk = devm_clk_get(&pdev->dev, "pcie_bus");
+	pcie->bus_clk = devm_clk_get(dev, "pcie_bus");
 	if (IS_ERR(pcie->bus_clk)) {
-		dev_err(pcie->dev, "cannot get pcie bus clock\n");
+		dev_err(dev, "cannot get pcie bus clock\n");
 		err = PTR_ERR(pcie->bus_clk);
 		goto fail_clk;
 	}
@@ -932,17 +934,17 @@ static int rcar_pcie_get_resources(struct platform_device *pdev,
 	if (err)
 		goto fail_clk;
 
-	i = irq_of_parse_and_map(pdev->dev.of_node, 0);
+	i = irq_of_parse_and_map(dev->of_node, 0);
 	if (!i) {
-		dev_err(pcie->dev, "cannot get platform resources for msi interrupt\n");
+		dev_err(dev, "cannot get platform resources for msi interrupt\n");
 		err = -ENOENT;
 		goto err_map_reg;
 	}
 	pcie->msi.irq1 = i;
 
-	i = irq_of_parse_and_map(pdev->dev.of_node, 1);
+	i = irq_of_parse_and_map(dev->of_node, 1);
 	if (!i) {
-		dev_err(pcie->dev, "cannot get platform resources for msi interrupt\n");
+		dev_err(dev, "cannot get platform resources for msi interrupt\n");
 		err = -ENOENT;
 		goto err_map_reg;
 	}
@@ -1119,60 +1121,60 @@ out_release_res:
 
 static int rcar_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct rcar_pcie *pcie;
 	unsigned int data;
 	const struct of_device_id *of_id;
 	int err;
 	int (*hw_init_fn)(struct rcar_pcie *);
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &pdev->dev;
-	platform_set_drvdata(pdev, pcie);
+	pcie->dev = dev;
 
 	INIT_LIST_HEAD(&pcie->resources);
 
 	rcar_pcie_parse_request_of_pci_ranges(pcie);
 
-	err = rcar_pcie_get_resources(pdev, pcie);
+	err = rcar_pcie_get_resources(pcie);
 	if (err < 0) {
-		dev_err(&pdev->dev, "failed to request resources: %d\n", err);
+		dev_err(dev, "failed to request resources: %d\n", err);
 		return err;
 	}
 
-	err = rcar_pcie_parse_map_dma_ranges(pcie, pdev->dev.of_node);
+	err = rcar_pcie_parse_map_dma_ranges(pcie, dev->of_node);
 	if (err)
 		return err;
 
-	of_id = of_match_device(rcar_pcie_of_match, pcie->dev);
+	of_id = of_match_device(rcar_pcie_of_match, dev);
 	if (!of_id || !of_id->data)
 		return -EINVAL;
 	hw_init_fn = of_id->data;
 
-	pm_runtime_enable(pcie->dev);
-	err = pm_runtime_get_sync(pcie->dev);
+	pm_runtime_enable(dev);
+	err = pm_runtime_get_sync(dev);
 	if (err < 0) {
-		dev_err(pcie->dev, "pm_runtime_get_sync failed\n");
+		dev_err(dev, "pm_runtime_get_sync failed\n");
 		goto err_pm_disable;
 	}
 
 	/* Failure to get a link might just be that no cards are inserted */
 	err = hw_init_fn(pcie);
 	if (err) {
-		dev_info(&pdev->dev, "PCIe link down\n");
+		dev_info(dev, "PCIe link down\n");
 		err = 0;
 		goto err_pm_put;
 	}
 
 	data = rcar_pci_read_reg(pcie, MACSR);
-	dev_info(&pdev->dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f);
+	dev_info(dev, "PCIe x%d: link up\n", (data >> 20) & 0x3f);
 
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		err = rcar_pcie_enable_msi(pcie);
 		if (err < 0) {
-			dev_err(&pdev->dev,
+			dev_err(dev,
 				"failed to enable MSI support: %d\n",
 				err);
 			goto err_pm_put;
@@ -1186,16 +1188,16 @@ static int rcar_pcie_probe(struct platform_device *pdev)
 	return 0;
 
 err_pm_put:
-	pm_runtime_put(pcie->dev);
+	pm_runtime_put(dev);
 
 err_pm_disable:
-	pm_runtime_disable(pcie->dev);
+	pm_runtime_disable(dev);
 	return err;
 }
 
 static struct platform_driver rcar_pcie_driver = {
 	.driver = {
-		.name = DRV_NAME,
+		.name = "rcar-pcie",
 		.of_match_table = rcar_pcie_of_match,
 		.suppress_bind_attrs = true,
 	},
diff --git a/drivers/pci/host/pcie-rockchip.c b/drivers/pci/host/pcie-rockchip.c
index b8c82fc812dc..e0b22dab9b7a 100644
--- a/drivers/pci/host/pcie-rockchip.c
+++ b/drivers/pci/host/pcie-rockchip.c
@@ -972,7 +972,7 @@ static int rockchip_pcie_prog_ob_atu(struct rockchip_pcie *rockchip,
 		return -EINVAL;
 	if (region_no == 0) {
 		if (AXI_REGION_0_SIZE < (2ULL << num_pass_bits))
-		return -EINVAL;
+			return -EINVAL;
 	}
 	if (region_no != 0) {
 		if (AXI_REGION_SIZE < (2ULL << num_pass_bits))
@@ -1091,8 +1091,6 @@ static int rockchip_pcie_probe(struct platform_device *pdev)
 	if (err)
 		goto err_vpcie;
 
-	platform_set_drvdata(pdev, rockchip);
-
 	rockchip_pcie_enable_interrupts(rockchip);
 
 	err = rockchip_pcie_init_irq_domain(rockchip);
diff --git a/drivers/pci/host/pcie-spear13xx.c b/drivers/pci/host/pcie-spear13xx.c
index 09aed85f275a..3cf197ba7f37 100644
--- a/drivers/pci/host/pcie-spear13xx.c
+++ b/drivers/pci/host/pcie-spear13xx.c
@@ -25,10 +25,10 @@
 #include "pcie-designware.h"
 
 struct spear13xx_pcie {
+	struct pcie_port	pp;		/* DT dbi is pp.dbi_base */
 	void __iomem		*app_base;
 	struct phy		*phy;
 	struct clk		*clk;
-	struct pcie_port	pp;
 	bool			is_gen1;
 };
 
@@ -57,96 +57,26 @@ struct pcie_app_reg {
 };
 
 /* CR0 ID */
-#define RX_LANE_FLIP_EN_ID			0
-#define TX_LANE_FLIP_EN_ID			1
-#define SYS_AUX_PWR_DET_ID			2
 #define APP_LTSSM_ENABLE_ID			3
-#define SYS_ATTEN_BUTTON_PRESSED_ID		4
-#define SYS_MRL_SENSOR_STATE_ID			5
-#define SYS_PWR_FAULT_DET_ID			6
-#define SYS_MRL_SENSOR_CHGED_ID			7
-#define SYS_PRE_DET_CHGED_ID			8
-#define SYS_CMD_CPLED_INT_ID			9
-#define APP_INIT_RST_0_ID			11
-#define APP_REQ_ENTR_L1_ID			12
-#define APP_READY_ENTR_L23_ID			13
-#define APP_REQ_EXIT_L1_ID			14
-#define DEVICE_TYPE_EP				(0 << 25)
-#define DEVICE_TYPE_LEP				(1 << 25)
 #define DEVICE_TYPE_RC				(4 << 25)
-#define SYS_INT_ID				29
 #define MISCTRL_EN_ID				30
 #define REG_TRANSLATION_ENABLE			31
 
-/* CR1 ID */
-#define APPS_PM_XMT_TURNOFF_ID			2
-#define APPS_PM_XMT_PME_ID			5
-
 /* CR3 ID */
-#define XMLH_LTSSM_STATE_DETECT_QUIET		0x00
-#define XMLH_LTSSM_STATE_DETECT_ACT		0x01
-#define XMLH_LTSSM_STATE_POLL_ACTIVE		0x02
-#define XMLH_LTSSM_STATE_POLL_COMPLIANCE	0x03
-#define XMLH_LTSSM_STATE_POLL_CONFIG		0x04
-#define XMLH_LTSSM_STATE_PRE_DETECT_QUIET	0x05
-#define XMLH_LTSSM_STATE_DETECT_WAIT		0x06
-#define XMLH_LTSSM_STATE_CFG_LINKWD_START	0x07
-#define XMLH_LTSSM_STATE_CFG_LINKWD_ACEPT	0x08
-#define XMLH_LTSSM_STATE_CFG_LANENUM_WAIT	0x09
-#define XMLH_LTSSM_STATE_CFG_LANENUM_ACEPT	0x0A
-#define XMLH_LTSSM_STATE_CFG_COMPLETE		0x0B
-#define XMLH_LTSSM_STATE_CFG_IDLE		0x0C
-#define XMLH_LTSSM_STATE_RCVRY_LOCK		0x0D
-#define XMLH_LTSSM_STATE_RCVRY_SPEED		0x0E
-#define XMLH_LTSSM_STATE_RCVRY_RCVRCFG		0x0F
-#define XMLH_LTSSM_STATE_RCVRY_IDLE		0x10
-#define XMLH_LTSSM_STATE_L0			0x11
-#define XMLH_LTSSM_STATE_L0S			0x12
-#define XMLH_LTSSM_STATE_L123_SEND_EIDLE	0x13
-#define XMLH_LTSSM_STATE_L1_IDLE		0x14
-#define XMLH_LTSSM_STATE_L2_IDLE		0x15
-#define XMLH_LTSSM_STATE_L2_WAKE		0x16
-#define XMLH_LTSSM_STATE_DISABLED_ENTRY		0x17
-#define XMLH_LTSSM_STATE_DISABLED_IDLE		0x18
-#define XMLH_LTSSM_STATE_DISABLED		0x19
-#define XMLH_LTSSM_STATE_LPBK_ENTRY		0x1A
-#define XMLH_LTSSM_STATE_LPBK_ACTIVE		0x1B
-#define XMLH_LTSSM_STATE_LPBK_EXIT		0x1C
-#define XMLH_LTSSM_STATE_LPBK_EXIT_TIMEOUT	0x1D
-#define XMLH_LTSSM_STATE_HOT_RESET_ENTRY	0x1E
-#define XMLH_LTSSM_STATE_HOT_RESET		0x1F
-#define XMLH_LTSSM_STATE_MASK			0x3F
 #define XMLH_LINK_UP				(1 << 6)
 
-/* CR4 ID */
-#define CFG_MSI_EN_ID				18
-
 /* CR6 */
-#define INTA_CTRL_INT				(1 << 7)
-#define INTB_CTRL_INT				(1 << 8)
-#define INTC_CTRL_INT				(1 << 9)
-#define INTD_CTRL_INT				(1 << 10)
 #define MSI_CTRL_INT				(1 << 26)
 
-/* CR19 ID */
-#define VEN_MSI_REQ_ID				11
-#define VEN_MSI_FUN_NUM_ID			8
-#define VEN_MSI_TC_ID				5
-#define VEN_MSI_VECTOR_ID			0
-#define VEN_MSI_REQ_EN		((u32)0x1 << VEN_MSI_REQ_ID)
-#define VEN_MSI_FUN_NUM_MASK	((u32)0x7 << VEN_MSI_FUN_NUM_ID)
-#define VEN_MSI_TC_MASK		((u32)0x7 << VEN_MSI_TC_ID)
-#define VEN_MSI_VECTOR_MASK	((u32)0x1F << VEN_MSI_VECTOR_ID)
-
 #define EXP_CAP_ID_OFFSET			0x70
 
 #define to_spear13xx_pcie(x)	container_of(x, struct spear13xx_pcie, pp)
 
-static int spear13xx_pcie_establish_link(struct pcie_port *pp)
+static int spear13xx_pcie_establish_link(struct spear13xx_pcie *spear13xx_pcie)
 {
-	u32 val;
-	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+	struct pcie_port *pp = &spear13xx_pcie->pp;
 	struct pcie_app_reg *app_reg = spear13xx_pcie->app_base;
+	u32 val;
 	u32 exp_cap_off = EXP_CAP_ID_OFFSET;
 
 	if (dw_pcie_link_up(pp)) {
@@ -203,9 +133,9 @@ static int spear13xx_pcie_establish_link(struct pcie_port *pp)
 
 static irqreturn_t spear13xx_pcie_irq_handler(int irq, void *arg)
 {
-	struct pcie_port *pp = arg;
-	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+	struct spear13xx_pcie *spear13xx_pcie = arg;
 	struct pcie_app_reg *app_reg = spear13xx_pcie->app_base;
+	struct pcie_port *pp = &spear13xx_pcie->pp;
 	unsigned int status;
 
 	status = readl(&app_reg->int_sts);
@@ -220,9 +150,9 @@ static irqreturn_t spear13xx_pcie_irq_handler(int irq, void *arg)
 	return IRQ_HANDLED;
 }
 
-static void spear13xx_pcie_enable_interrupts(struct pcie_port *pp)
+static void spear13xx_pcie_enable_interrupts(struct spear13xx_pcie *spear13xx_pcie)
 {
-	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+	struct pcie_port *pp = &spear13xx_pcie->pp;
 	struct pcie_app_reg *app_reg = spear13xx_pcie->app_base;
 
 	/* Enable MSI interrupt */
@@ -246,8 +176,10 @@ static int spear13xx_pcie_link_up(struct pcie_port *pp)
 
 static void spear13xx_pcie_host_init(struct pcie_port *pp)
 {
-	spear13xx_pcie_establish_link(pp);
-	spear13xx_pcie_enable_interrupts(pp);
+	struct spear13xx_pcie *spear13xx_pcie = to_spear13xx_pcie(pp);
+
+	spear13xx_pcie_establish_link(spear13xx_pcie);
+	spear13xx_pcie_enable_interrupts(spear13xx_pcie);
 }
 
 static struct pcie_host_ops spear13xx_pcie_host_ops = {
@@ -255,10 +187,11 @@ static struct pcie_host_ops spear13xx_pcie_host_ops = {
 	.host_init = spear13xx_pcie_host_init,
 };
 
-static int spear13xx_add_pcie_port(struct pcie_port *pp,
-					 struct platform_device *pdev)
+static int spear13xx_add_pcie_port(struct spear13xx_pcie *spear13xx_pcie,
+				   struct platform_device *pdev)
 {
-	struct device *dev = &pdev->dev;
+	struct pcie_port *pp = &spear13xx_pcie->pp;
+	struct device *dev = pp->dev;
 	int ret;
 
 	pp->irq = platform_get_irq(pdev, 0);
@@ -268,7 +201,7 @@ static int spear13xx_add_pcie_port(struct pcie_port *pp,
 	}
 	ret = devm_request_irq(dev, pp->irq, spear13xx_pcie_irq_handler,
 			       IRQF_SHARED | IRQF_NO_THREAD,
-			       "spear1340-pcie", pp);
+			       "spear1340-pcie", spear13xx_pcie);
 	if (ret) {
 		dev_err(dev, "failed to request irq %d\n", pp->irq);
 		return ret;
@@ -288,10 +221,10 @@ static int spear13xx_add_pcie_port(struct pcie_port *pp,
 
 static int spear13xx_pcie_probe(struct platform_device *pdev)
 {
+	struct device *dev = &pdev->dev;
 	struct spear13xx_pcie *spear13xx_pcie;
 	struct pcie_port *pp;
-	struct device *dev = &pdev->dev;
-	struct device_node *np = pdev->dev.of_node;
+	struct device_node *np = dev->of_node;
 	struct resource *dbi_base;
 	int ret;
 
@@ -323,7 +256,6 @@ static int spear13xx_pcie_probe(struct platform_device *pdev)
 	}
 
 	pp = &spear13xx_pcie->pp;
-
 	pp->dev = dev;
 
 	dbi_base = platform_get_resource_byname(pdev, IORESOURCE_MEM, "dbi");
@@ -338,7 +270,7 @@ static int spear13xx_pcie_probe(struct platform_device *pdev)
 	if (of_property_read_bool(np, "st,pcie-is-gen1"))
 		spear13xx_pcie->is_gen1 = true;
 
-	ret = spear13xx_add_pcie_port(pp, pdev);
+	ret = spear13xx_add_pcie_port(spear13xx_pcie, pdev);
 	if (ret < 0)
 		goto fail_clk;
 
diff --git a/drivers/pci/host/pcie-xilinx-nwl.c b/drivers/pci/host/pcie-xilinx-nwl.c
index 67eae4179290..43eaa4afab94 100644
--- a/drivers/pci/host/pcie-xilinx-nwl.c
+++ b/drivers/pci/host/pcie-xilinx-nwl.c
@@ -212,6 +212,7 @@ static bool nwl_phy_link_up(struct nwl_pcie *pcie)
 
 static int nwl_wait_for_link(struct nwl_pcie *pcie)
 {
+	struct device *dev = pcie->dev;
 	int retries;
 
 	/* check if the link is up or not */
@@ -221,7 +222,7 @@ static int nwl_wait_for_link(struct nwl_pcie *pcie)
 		usleep_range(LINK_WAIT_USLEEP_MIN, LINK_WAIT_USLEEP_MAX);
 	}
 
-	dev_err(pcie->dev, "PHY link never came up\n");
+	dev_err(dev, "PHY link never came up\n");
 	return -ETIMEDOUT;
 }
 
@@ -277,6 +278,7 @@ static struct pci_ops nwl_pcie_ops = {
 static irqreturn_t nwl_pcie_misc_handler(int irq, void *data)
 {
 	struct nwl_pcie *pcie = data;
+	struct device *dev = pcie->dev;
 	u32 misc_stat;
 
 	/* Checking for misc interrupts */
@@ -286,45 +288,43 @@ static irqreturn_t nwl_pcie_misc_handler(int irq, void *data)
 		return IRQ_NONE;
 
 	if (misc_stat & MSGF_MISC_SR_RXMSG_OVER)
-		dev_err(pcie->dev, "Received Message FIFO Overflow\n");
+		dev_err(dev, "Received Message FIFO Overflow\n");
 
 	if (misc_stat & MSGF_MISC_SR_SLAVE_ERR)
-		dev_err(pcie->dev, "Slave error\n");
+		dev_err(dev, "Slave error\n");
 
 	if (misc_stat & MSGF_MISC_SR_MASTER_ERR)
-		dev_err(pcie->dev, "Master error\n");
+		dev_err(dev, "Master error\n");
 
 	if (misc_stat & MSGF_MISC_SR_I_ADDR_ERR)
-		dev_err(pcie->dev,
-			"In Misc Ingress address translation error\n");
+		dev_err(dev, "In Misc Ingress address translation error\n");
 
 	if (misc_stat & MSGF_MISC_SR_E_ADDR_ERR)
-		dev_err(pcie->dev,
-			"In Misc Egress address translation error\n");
+		dev_err(dev, "In Misc Egress address translation error\n");
 
 	if (misc_stat & MSGF_MISC_SR_FATAL_AER)
-		dev_err(pcie->dev, "Fatal Error in AER Capability\n");
+		dev_err(dev, "Fatal Error in AER Capability\n");
 
 	if (misc_stat & MSGF_MISC_SR_NON_FATAL_AER)
-		dev_err(pcie->dev, "Non-Fatal Error in AER Capability\n");
+		dev_err(dev, "Non-Fatal Error in AER Capability\n");
 
 	if (misc_stat & MSGF_MISC_SR_CORR_AER)
-		dev_err(pcie->dev, "Correctable Error in AER Capability\n");
+		dev_err(dev, "Correctable Error in AER Capability\n");
 
 	if (misc_stat & MSGF_MISC_SR_UR_DETECT)
-		dev_err(pcie->dev, "Unsupported request Detected\n");
+		dev_err(dev, "Unsupported request Detected\n");
 
 	if (misc_stat & MSGF_MISC_SR_NON_FATAL_DEV)
-		dev_err(pcie->dev, "Non-Fatal Error Detected\n");
+		dev_err(dev, "Non-Fatal Error Detected\n");
 
 	if (misc_stat & MSGF_MISC_SR_FATAL_DEV)
-		dev_err(pcie->dev, "Fatal Error Detected\n");
+		dev_err(dev, "Fatal Error Detected\n");
 
 	if (misc_stat & MSGF_MSIC_SR_LINK_AUTO_BWIDTH)
-		dev_info(pcie->dev, "Link Autonomous Bandwidth Management Status bit set\n");
+		dev_info(dev, "Link Autonomous Bandwidth Management Status bit set\n");
 
 	if (misc_stat & MSGF_MSIC_SR_LINK_BWIDTH)
-		dev_info(pcie->dev, "Link Bandwidth Management Status bit set\n");
+		dev_info(dev, "Link Bandwidth Management Status bit set\n");
 
 	/* Clear misc interrupt status */
 	nwl_bridge_writel(pcie, misc_stat, MSGF_MISC_STATUS);
@@ -494,20 +494,21 @@ static const struct irq_domain_ops dev_msi_domain_ops = {
 static int nwl_pcie_init_msi_irq_domain(struct nwl_pcie *pcie)
 {
 #ifdef CONFIG_PCI_MSI
-	struct fwnode_handle *fwnode = of_node_to_fwnode(pcie->dev->of_node);
+	struct device *dev = pcie->dev;
+	struct fwnode_handle *fwnode = of_node_to_fwnode(dev->of_node);
 	struct nwl_msi *msi = &pcie->msi;
 
 	msi->dev_domain = irq_domain_add_linear(NULL, INT_PCI_MSI_NR,
 						&dev_msi_domain_ops, pcie);
 	if (!msi->dev_domain) {
-		dev_err(pcie->dev, "failed to create dev IRQ domain\n");
+		dev_err(dev, "failed to create dev IRQ domain\n");
 		return -ENOMEM;
 	}
 	msi->msi_domain = pci_msi_create_irq_domain(fwnode,
 						    &nwl_msi_domain_info,
 						    msi->dev_domain);
 	if (!msi->msi_domain) {
-		dev_err(pcie->dev, "failed to create msi IRQ domain\n");
+		dev_err(dev, "failed to create msi IRQ domain\n");
 		irq_domain_remove(msi->dev_domain);
 		return -ENOMEM;
 	}
@@ -517,12 +518,13 @@ static int nwl_pcie_init_msi_irq_domain(struct nwl_pcie *pcie)
 
 static int nwl_pcie_init_irq_domain(struct nwl_pcie *pcie)
 {
-	struct device_node *node = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *node = dev->of_node;
 	struct device_node *legacy_intc_node;
 
 	legacy_intc_node = of_get_next_child(node, NULL);
 	if (!legacy_intc_node) {
-		dev_err(pcie->dev, "No legacy intc node found\n");
+		dev_err(dev, "No legacy intc node found\n");
 		return -EINVAL;
 	}
 
@@ -532,7 +534,7 @@ static int nwl_pcie_init_irq_domain(struct nwl_pcie *pcie)
 							pcie);
 
 	if (!pcie->legacy_irq_domain) {
-		dev_err(pcie->dev, "failed to create IRQ domain\n");
+		dev_err(dev, "failed to create IRQ domain\n");
 		return -ENOMEM;
 	}
 
@@ -542,7 +544,8 @@ static int nwl_pcie_init_irq_domain(struct nwl_pcie *pcie)
 
 static int nwl_pcie_enable_msi(struct nwl_pcie *pcie, struct pci_bus *bus)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	struct nwl_msi *msi = &pcie->msi;
 	unsigned long base;
 	int ret;
@@ -557,7 +560,7 @@ static int nwl_pcie_enable_msi(struct nwl_pcie *pcie, struct pci_bus *bus)
 	/* Get msi_1 IRQ number */
 	msi->irq_msi1 = platform_get_irq_byname(pdev, "msi1");
 	if (msi->irq_msi1 < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ#%d\n", msi->irq_msi1);
+		dev_err(dev, "failed to get IRQ#%d\n", msi->irq_msi1);
 		ret = -EINVAL;
 		goto err;
 	}
@@ -568,7 +571,7 @@ static int nwl_pcie_enable_msi(struct nwl_pcie *pcie, struct pci_bus *bus)
 	/* Get msi_0 IRQ number */
 	msi->irq_msi0 = platform_get_irq_byname(pdev, "msi0");
 	if (msi->irq_msi0 < 0) {
-		dev_err(&pdev->dev, "failed to get IRQ#%d\n", msi->irq_msi0);
+		dev_err(dev, "failed to get IRQ#%d\n", msi->irq_msi0);
 		ret = -EINVAL;
 		goto err;
 	}
@@ -579,7 +582,7 @@ static int nwl_pcie_enable_msi(struct nwl_pcie *pcie, struct pci_bus *bus)
 	/* Check for msii_present bit */
 	ret = nwl_bridge_readl(pcie, I_MSII_CAPABILITIES) & MSII_PRESENT;
 	if (!ret) {
-		dev_err(pcie->dev, "MSI not present\n");
+		dev_err(dev, "MSI not present\n");
 		ret = -EIO;
 		goto err;
 	}
@@ -628,13 +631,14 @@ err:
 
 static int nwl_pcie_bridge_init(struct nwl_pcie *pcie)
 {
-	struct platform_device *pdev = to_platform_device(pcie->dev);
+	struct device *dev = pcie->dev;
+	struct platform_device *pdev = to_platform_device(dev);
 	u32 breg_val, ecam_val, first_busno = 0;
 	int err;
 
 	breg_val = nwl_bridge_readl(pcie, E_BREG_CAPABILITIES) & BREG_PRESENT;
 	if (!breg_val) {
-		dev_err(pcie->dev, "BREG is not present\n");
+		dev_err(dev, "BREG is not present\n");
 		return breg_val;
 	}
 
@@ -665,7 +669,7 @@ static int nwl_pcie_bridge_init(struct nwl_pcie *pcie)
 
 	ecam_val = nwl_bridge_readl(pcie, E_ECAM_CAPABILITIES) & E_ECAM_PRESENT;
 	if (!ecam_val) {
-		dev_err(pcie->dev, "ECAM is not present\n");
+		dev_err(dev, "ECAM is not present\n");
 		return ecam_val;
 	}
 
@@ -692,23 +696,23 @@ static int nwl_pcie_bridge_init(struct nwl_pcie *pcie)
 	writel(ecam_val, (pcie->ecam_base + PCI_PRIMARY_BUS));
 
 	if (nwl_pcie_link_up(pcie))
-		dev_info(pcie->dev, "Link is UP\n");
+		dev_info(dev, "Link is UP\n");
 	else
-		dev_info(pcie->dev, "Link is DOWN\n");
+		dev_info(dev, "Link is DOWN\n");
 
 	/* Get misc IRQ number */
 	pcie->irq_misc = platform_get_irq_byname(pdev, "misc");
 	if (pcie->irq_misc < 0) {
-		dev_err(&pdev->dev, "failed to get misc IRQ %d\n",
+		dev_err(dev, "failed to get misc IRQ %d\n",
 			pcie->irq_misc);
 		return -EINVAL;
 	}
 
-	err = devm_request_irq(pcie->dev, pcie->irq_misc,
+	err = devm_request_irq(dev, pcie->irq_misc,
 			       nwl_pcie_misc_handler, IRQF_SHARED,
 			       "nwl_pcie:misc", pcie);
 	if (err) {
-		dev_err(pcie->dev, "fail to register misc IRQ#%d\n",
+		dev_err(dev, "fail to register misc IRQ#%d\n",
 			pcie->irq_misc);
 		return err;
 	}
@@ -744,31 +748,32 @@ static int nwl_pcie_bridge_init(struct nwl_pcie *pcie)
 static int nwl_pcie_parse_dt(struct nwl_pcie *pcie,
 			     struct platform_device *pdev)
 {
-	struct device_node *node = pcie->dev->of_node;
+	struct device *dev = pcie->dev;
+	struct device_node *node = dev->of_node;
 	struct resource *res;
 	const char *type;
 
 	/* Check for device type */
 	type = of_get_property(node, "device_type", NULL);
 	if (!type || strcmp(type, "pci")) {
-		dev_err(pcie->dev, "invalid \"device_type\" %s\n", type);
+		dev_err(dev, "invalid \"device_type\" %s\n", type);
 		return -EINVAL;
 	}
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "breg");
-	pcie->breg_base = devm_ioremap_resource(pcie->dev, res);
+	pcie->breg_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->breg_base))
 		return PTR_ERR(pcie->breg_base);
 	pcie->phys_breg_base = res->start;
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "pcireg");
-	pcie->pcireg_base = devm_ioremap_resource(pcie->dev, res);
+	pcie->pcireg_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->pcireg_base))
 		return PTR_ERR(pcie->pcireg_base);
 	pcie->phys_pcie_reg_base = res->start;
 
 	res = platform_get_resource_byname(pdev, IORESOURCE_MEM, "cfg");
-	pcie->ecam_base = devm_ioremap_resource(pcie->dev, res);
+	pcie->ecam_base = devm_ioremap_resource(dev, res);
 	if (IS_ERR(pcie->ecam_base))
 		return PTR_ERR(pcie->ecam_base);
 	pcie->phys_ecam_base = res->start;
@@ -776,8 +781,7 @@ static int nwl_pcie_parse_dt(struct nwl_pcie *pcie,
 	/* Get intx IRQ number */
 	pcie->irq_intx = platform_get_irq_byname(pdev, "intx");
 	if (pcie->irq_intx < 0) {
-		dev_err(&pdev->dev, "failed to get intx IRQ %d\n",
-			pcie->irq_intx);
+		dev_err(dev, "failed to get intx IRQ %d\n", pcie->irq_intx);
 		return -EINVAL;
 	}
 
@@ -794,7 +798,8 @@ static const struct of_device_id nwl_pcie_of_match[] = {
 
 static int nwl_pcie_probe(struct platform_device *pdev)
 {
-	struct device_node *node = pdev->dev.of_node;
+	struct device *dev = &pdev->dev;
+	struct device_node *node = dev->of_node;
 	struct nwl_pcie *pcie;
 	struct pci_bus *bus;
 	struct pci_bus *child;
@@ -802,42 +807,42 @@ static int nwl_pcie_probe(struct platform_device *pdev)
 	resource_size_t iobase = 0;
 	LIST_HEAD(res);
 
-	pcie = devm_kzalloc(&pdev->dev, sizeof(*pcie), GFP_KERNEL);
+	pcie = devm_kzalloc(dev, sizeof(*pcie), GFP_KERNEL);
 	if (!pcie)
 		return -ENOMEM;
 
-	pcie->dev = &pdev->dev;
+	pcie->dev = dev;
 	pcie->ecam_value = NWL_ECAM_VALUE_DEFAULT;
 
 	err = nwl_pcie_parse_dt(pcie, pdev);
 	if (err) {
-		dev_err(pcie->dev, "Parsing DT failed\n");
+		dev_err(dev, "Parsing DT failed\n");
 		return err;
 	}
 
 	err = nwl_pcie_bridge_init(pcie);
 	if (err) {
-		dev_err(pcie->dev, "HW Initialization failed\n");
+		dev_err(dev, "HW Initialization failed\n");
 		return err;
 	}
 
 	err = of_pci_get_host_bridge_resources(node, 0, 0xff, &res, &iobase);
 	if (err) {
-		dev_err(pcie->dev, "Getting bridge resources failed\n");
+		dev_err(dev, "Getting bridge resources failed\n");
 		return err;
 	}
 
-	err = devm_request_pci_bus_resources(pcie->dev, &res);
+	err = devm_request_pci_bus_resources(dev, &res);
 	if (err)
 		goto error;
 
 	err = nwl_pcie_init_irq_domain(pcie);
 	if (err) {
-		dev_err(pcie->dev, "Failed creating IRQ Domain\n");
+		dev_err(dev, "Failed creating IRQ Domain\n");
 		goto error;
 	}
 
-	bus = pci_create_root_bus(&pdev->dev, pcie->root_busno,
+	bus = pci_create_root_bus(dev, pcie->root_busno,
 				  &nwl_pcie_ops, pcie, &res);
 	if (!bus) {
 		err = -ENOMEM;
@@ -847,8 +852,7 @@ static int nwl_pcie_probe(struct platform_device *pdev)
 	if (IS_ENABLED(CONFIG_PCI_MSI)) {
 		err = nwl_pcie_enable_msi(pcie, bus);
 		if (err < 0) {
-			dev_err(&pdev->dev,
-				"failed to enable MSI support: %d\n", err);
+			dev_err(dev, "failed to enable MSI support: %d\n", err);
 			goto error;
 		}
 	}
@@ -857,7 +861,6 @@ static int nwl_pcie_probe(struct platform_device *pdev)
 	list_for_each_entry(child, &bus->children, node)
 		pcie_bus_configure_settings(child);
 	pci_bus_add_devices(bus);
-	platform_set_drvdata(pdev, pcie);
 	return 0;
 
 error:
diff --git a/drivers/pci/host/pcie-xilinx.c b/drivers/pci/host/pcie-xilinx.c
index be568039d9d0..c8616fadccf1 100644
--- a/drivers/pci/host/pcie-xilinx.c
+++ b/drivers/pci/host/pcie-xilinx.c
@@ -140,10 +140,11 @@ static inline bool xilinx_pcie_link_is_up(struct xilinx_pcie_port *port)
  */
 static void xilinx_pcie_clear_err_interrupts(struct xilinx_pcie_port *port)
 {
+	struct device *dev = port->dev;
 	unsigned long val = pcie_read(port, XILINX_PCIE_REG_RPEFR);
 
 	if (val & XILINX_PCIE_RPEFR_ERR_VALID) {
-		dev_dbg(port->dev, "Requester ID %lu\n",
+		dev_dbg(dev, "Requester ID %lu\n",
 			val & XILINX_PCIE_RPEFR_REQ_ID);
 		pcie_write(port, XILINX_PCIE_RPEFR_ALL_MASK,
 			   XILINX_PCIE_REG_RPEFR);
@@ -228,11 +229,10 @@ static void xilinx_pcie_destroy_msi(unsigned int irq)
 
 /**
  * xilinx_pcie_assign_msi - Allocate MSI number
- * @port: PCIe port structure
  *
  * Return: A valid IRQ on success and error value on failure.
  */
-static int xilinx_pcie_assign_msi(struct xilinx_pcie_port *port)
+static int xilinx_pcie_assign_msi(void)
 {
 	int pos;
 
@@ -275,7 +275,7 @@ static int xilinx_pcie_msi_setup_irq(struct msi_controller *chip,
 	struct msi_msg msg;
 	phys_addr_t msg_addr;
 
-	hwirq = xilinx_pcie_assign_msi(port);
+	hwirq = xilinx_pcie_assign_msi();
 	if (hwirq < 0)
 		return hwirq;
 
@@ -383,6 +383,7 @@ static const struct irq_domain_ops intx_domain_ops = {
 static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
 {
 	struct xilinx_pcie_port *port = (struct xilinx_pcie_port *)data;
+	struct device *dev = port->dev;
 	u32 val, mask, status, msi_data;
 
 	/* Read interrupt decode and mask registers */
@@ -394,32 +395,32 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
 		return IRQ_NONE;
 
 	if (status & XILINX_PCIE_INTR_LINK_DOWN)
-		dev_warn(port->dev, "Link Down\n");
+		dev_warn(dev, "Link Down\n");
 
 	if (status & XILINX_PCIE_INTR_ECRC_ERR)
-		dev_warn(port->dev, "ECRC failed\n");
+		dev_warn(dev, "ECRC failed\n");
 
 	if (status & XILINX_PCIE_INTR_STR_ERR)
-		dev_warn(port->dev, "Streaming error\n");
+		dev_warn(dev, "Streaming error\n");
 
 	if (status & XILINX_PCIE_INTR_HOT_RESET)
-		dev_info(port->dev, "Hot reset\n");
+		dev_info(dev, "Hot reset\n");
 
 	if (status & XILINX_PCIE_INTR_CFG_TIMEOUT)
-		dev_warn(port->dev, "ECAM access timeout\n");
+		dev_warn(dev, "ECAM access timeout\n");
 
 	if (status & XILINX_PCIE_INTR_CORRECTABLE) {
-		dev_warn(port->dev, "Correctable error message\n");
+		dev_warn(dev, "Correctable error message\n");
 		xilinx_pcie_clear_err_interrupts(port);
 	}
 
 	if (status & XILINX_PCIE_INTR_NONFATAL) {
-		dev_warn(port->dev, "Non fatal error message\n");
+		dev_warn(dev, "Non fatal error message\n");
 		xilinx_pcie_clear_err_interrupts(port);
 	}
 
 	if (status & XILINX_PCIE_INTR_FATAL) {
-		dev_warn(port->dev, "Fatal error message\n");
+		dev_warn(dev, "Fatal error message\n");
 		xilinx_pcie_clear_err_interrupts(port);
 	}
 
@@ -429,7 +430,7 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
 
 		/* Check whether interrupt valid */
 		if (!(val & XILINX_PCIE_RPIFR1_INTR_VALID)) {
-			dev_warn(port->dev, "RP Intr FIFO1 read error\n");
+			dev_warn(dev, "RP Intr FIFO1 read error\n");
 			goto error;
 		}
 
@@ -451,7 +452,7 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
 		val = pcie_read(port, XILINX_PCIE_REG_RPIFR1);
 
 		if (!(val & XILINX_PCIE_RPIFR1_INTR_VALID)) {
-			dev_warn(port->dev, "RP Intr FIFO1 read error\n");
+			dev_warn(dev, "RP Intr FIFO1 read error\n");
 			goto error;
 		}
 
@@ -471,31 +472,31 @@ static irqreturn_t xilinx_pcie_intr_handler(int irq, void *data)
 	}
 
 	if (status & XILINX_PCIE_INTR_SLV_UNSUPP)
-		dev_warn(port->dev, "Slave unsupported request\n");
+		dev_warn(dev, "Slave unsupported request\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_UNEXP)
-		dev_warn(port->dev, "Slave unexpected completion\n");
+		dev_warn(dev, "Slave unexpected completion\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_COMPL)
-		dev_warn(port->dev, "Slave completion timeout\n");
+		dev_warn(dev, "Slave completion timeout\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_ERRP)
-		dev_warn(port->dev, "Slave Error Poison\n");
+		dev_warn(dev, "Slave Error Poison\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_CMPABT)
-		dev_warn(port->dev, "Slave Completer Abort\n");
+		dev_warn(dev, "Slave Completer Abort\n");
 
 	if (status & XILINX_PCIE_INTR_SLV_ILLBUR)
-		dev_warn(port->dev, "Slave Illegal Burst\n");
+		dev_warn(dev, "Slave Illegal Burst\n");
 
 	if (status & XILINX_PCIE_INTR_MST_DECERR)
-		dev_warn(port->dev, "Master decode error\n");
+		dev_warn(dev, "Master decode error\n");
 
 	if (status & XILINX_PCIE_INTR_MST_SLVERR)
-		dev_warn(port->dev, "Master slave error\n");
+		dev_warn(dev, "Master slave error\n");
 
 	if (status & XILINX_PCIE_INTR_MST_ERRP)
-		dev_warn(port->dev, "Master error poison\n");
+		dev_warn(dev, "Master error poison\n");
 
 error:
 	/* Clear the Interrupt Decode register */
@@ -554,10 +555,12 @@ static int xilinx_pcie_init_irq_domain(struct xilinx_pcie_port *port)
  */
 static void xilinx_pcie_init_port(struct xilinx_pcie_port *port)
 {
+	struct device *dev = port->dev;
+
 	if (xilinx_pcie_link_is_up(port))
-		dev_info(port->dev, "PCIe Link is UP\n");
+		dev_info(dev, "PCIe Link is UP\n");
 	else
-		dev_info(port->dev, "PCIe Link is DOWN\n");
+		dev_info(dev, "PCIe Link is DOWN\n");
 
 	/* Disable all interrupts */
 	pcie_write(port, ~XILINX_PCIE_IDR_ALL_MASK,
@@ -627,8 +630,8 @@ static int xilinx_pcie_parse_dt(struct xilinx_pcie_port *port)
  */
 static int xilinx_pcie_probe(struct platform_device *pdev)
 {
-	struct xilinx_pcie_port *port;
 	struct device *dev = &pdev->dev;
+	struct xilinx_pcie_port *port;
 	struct pci_bus *bus;
 	int err;
 	resource_size_t iobase = 0;
@@ -668,15 +671,14 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
 	if (err)
 		goto error;
 
-	bus = pci_create_root_bus(&pdev->dev, 0,
-				  &xilinx_pcie_ops, port, &res);
+	bus = pci_create_root_bus(dev, 0, &xilinx_pcie_ops, port, &res);
 	if (!bus) {
 		err = -ENOMEM;
 		goto error;
 	}
 
 #ifdef CONFIG_PCI_MSI
-	xilinx_pcie_msi_chip.dev = port->dev;
+	xilinx_pcie_msi_chip.dev = dev;
 	bus->msi = &xilinx_pcie_msi_chip;
 #endif
 	pci_scan_child_bus(bus);
@@ -685,8 +687,6 @@ static int xilinx_pcie_probe(struct platform_device *pdev)
 	pci_fixup_irqs(pci_common_swizzle, of_irq_parse_and_map_pci);
 #endif
 	pci_bus_add_devices(bus);
-	platform_set_drvdata(pdev, port);
-
 	return 0;
 
 error:
diff --git a/drivers/platform/x86/acerhdf.c b/drivers/platform/x86/acerhdf.c
index 460fa6708bfc..2acdb0d6ea89 100644
--- a/drivers/platform/x86/acerhdf.c
+++ b/drivers/platform/x86/acerhdf.c
@@ -405,7 +405,7 @@ static inline void acerhdf_enable_kernelmode(void)
 	kernelmode = 1;
 
 	thz_dev->polling_delay = interval*1000;
-	thermal_zone_device_update(thz_dev);
+	thermal_zone_device_update(thz_dev, THERMAL_EVENT_UNSPECIFIED);
 	pr_notice("kernel mode fan control ON\n");
 }
 
diff --git a/drivers/platform/x86/asus-laptop.c b/drivers/platform/x86/asus-laptop.c
index 15f131146501..28551f5a2e07 100644
--- a/drivers/platform/x86/asus-laptop.c
+++ b/drivers/platform/x86/asus-laptop.c
@@ -932,30 +932,19 @@ static ssize_t infos_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(infos);
 
-static int parse_arg(const char *buf, unsigned long count, int *val)
-{
-	if (!count)
-		return 0;
-	if (count > 31)
-		return -EINVAL;
-	if (sscanf(buf, "%i", val) != 1)
-		return -EINVAL;
-	return count;
-}
-
 static ssize_t sysfs_acpi_set(struct asus_laptop *asus,
 			      const char *buf, size_t count,
 			      const char *method)
 {
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv <= 0)
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
 		return rv;
 
 	if (write_acpi_int(asus->handle, method, value))
 		return -ENODEV;
-	return rv;
+	return count;
 }
 
 /*
@@ -975,15 +964,17 @@ static ssize_t ledd_store(struct device *dev, struct device_attribute *attr,
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0) {
-		if (write_acpi_int(asus->handle, METHOD_LEDD, value)) {
-			pr_warn("LED display write failed\n");
-			return -ENODEV;
-		}
-		asus->ledd_status = (u32) value;
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
+
+	if (write_acpi_int(asus->handle, METHOD_LEDD, value)) {
+		pr_warn("LED display write failed\n");
+		return -ENODEV;
 	}
-	return rv;
+
+	asus->ledd_status = (u32) value;
+	return count;
 }
 static DEVICE_ATTR_RW(ledd);
 
@@ -1148,10 +1139,12 @@ static ssize_t display_store(struct device *dev, struct device_attribute *attr,
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0)
-		asus_set_display(asus, value);
-	return rv;
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
+
+	asus_set_display(asus, value);
+	return count;
 }
 static DEVICE_ATTR_WO(display);
 
@@ -1190,11 +1183,12 @@ static ssize_t ls_switch_store(struct device *dev,
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0)
-		asus_als_switch(asus, value ? 1 : 0);
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
 
-	return rv;
+	asus_als_switch(asus, value ? 1 : 0);
+	return count;
 }
 static DEVICE_ATTR_RW(ls_switch);
 
@@ -1219,14 +1213,15 @@ static ssize_t ls_level_store(struct device *dev, struct device_attribute *attr,
 	struct asus_laptop *asus = dev_get_drvdata(dev);
 	int rv, value;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv > 0) {
-		value = (0 < value) ? ((15 < value) ? 15 : value) : 0;
-		/* 0 <= value <= 15 */
-		asus_als_level(asus, value);
-	}
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
+
+	value = (0 < value) ? ((15 < value) ? 15 : value) : 0;
+	/* 0 <= value <= 15 */
+	asus_als_level(asus, value);
 
-	return rv;
+	return count;
 }
 static DEVICE_ATTR_RW(ls_level);
 
@@ -1301,14 +1296,14 @@ static ssize_t gps_store(struct device *dev, struct device_attribute *attr,
 	int rv, value;
 	int ret;
 
-	rv = parse_arg(buf, count, &value);
-	if (rv <= 0)
-		return -EINVAL;
+	rv = kstrtoint(buf, 0, &value);
+	if (rv < 0)
+		return rv;
 	ret = asus_gps_switch(asus, !!value);
 	if (ret)
 		return ret;
 	rfkill_set_sw_state(asus->gps.rfkill, !value);
-	return rv;
+	return count;
 }
 static DEVICE_ATTR_RW(gps);
 
diff --git a/drivers/platform/x86/asus-nb-wmi.c b/drivers/platform/x86/asus-nb-wmi.c
index adecc1c555f0..26e4cbc34db8 100644
--- a/drivers/platform/x86/asus-nb-wmi.c
+++ b/drivers/platform/x86/asus-nb-wmi.c
@@ -27,6 +27,7 @@
 #include <linux/input/sparse-keymap.h>
 #include <linux/fb.h>
 #include <linux/dmi.h>
+#include <linux/i8042.h>
 
 #include "asus-wmi.h"
 
@@ -55,10 +56,34 @@ MODULE_PARM_DESC(wapf, "WAPF value");
 
 static struct quirk_entry *quirks;
 
+static bool asus_q500a_i8042_filter(unsigned char data, unsigned char str,
+			      struct serio *port)
+{
+	static bool extended;
+	bool ret = false;
+
+	if (str & I8042_STR_AUXDATA)
+		return false;
+
+	if (unlikely(data == 0xe1)) {
+		extended = true;
+		ret = true;
+	} else if (unlikely(extended)) {
+		extended = false;
+		ret = true;
+	}
+
+	return ret;
+}
+
 static struct quirk_entry quirk_asus_unknown = {
 	.wapf = 0,
 };
 
+static struct quirk_entry quirk_asus_q500a = {
+	.i8042_filter = asus_q500a_i8042_filter,
+};
+
 /*
  * For those machines that need software to control bt/wifi status
  * and can't adjust brightness through ACPI interface
@@ -87,6 +112,10 @@ static struct quirk_entry quirk_no_rfkill_wapf4 = {
 	.no_rfkill = true,
 };
 
+static struct quirk_entry quirk_asus_ux303ub = {
+	.wmi_backlight_native = true,
+};
+
 static int dmi_matched(const struct dmi_system_id *dmi)
 {
 	quirks = dmi->driver_data;
@@ -96,6 +125,15 @@ static int dmi_matched(const struct dmi_system_id *dmi)
 static const struct dmi_system_id asus_quirks[] = {
 	{
 		.callback = dmi_matched,
+		.ident = "ASUSTeK COMPUTER INC. Q500A",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "Q500A"),
+		},
+		.driver_data = &quirk_asus_q500a,
+	},
+	{
+		.callback = dmi_matched,
 		.ident = "ASUSTeK COMPUTER INC. U32U",
 		.matches = {
 			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK Computer Inc."),
@@ -351,11 +389,22 @@ static const struct dmi_system_id asus_quirks[] = {
 		},
 		.driver_data = &quirk_no_rfkill,
 	},
+	{
+		.callback = dmi_matched,
+		.ident = "ASUSTeK COMPUTER INC. UX303UB",
+		.matches = {
+			DMI_MATCH(DMI_SYS_VENDOR, "ASUSTeK COMPUTER INC."),
+			DMI_MATCH(DMI_PRODUCT_NAME, "UX303UB"),
+		},
+		.driver_data = &quirk_asus_ux303ub,
+	},
 	{},
 };
 
 static void asus_nb_wmi_quirks(struct asus_wmi_driver *driver)
 {
+	int ret;
+
 	quirks = &quirk_asus_unknown;
 	dmi_check_system(asus_quirks);
 
@@ -367,6 +416,15 @@ static void asus_nb_wmi_quirks(struct asus_wmi_driver *driver)
 		quirks->wapf = wapf;
 	else
 		wapf = quirks->wapf;
+
+	if (quirks->i8042_filter) {
+		ret = i8042_install_filter(quirks->i8042_filter);
+		if (ret) {
+			pr_warn("Unable to install key filter\n");
+			return;
+		}
+		pr_info("Using i8042 filter function for receiving events\n");
+	}
 }
 
 static const struct key_entry asus_nb_wmi_keymap[] = {
diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c
index 7c093a0b78bb..ce6ca31a2d09 100644
--- a/drivers/platform/x86/asus-wmi.c
+++ b/drivers/platform/x86/asus-wmi.c
@@ -2084,6 +2084,9 @@ static int asus_wmi_add(struct platform_device *pdev)
 	if (asus->driver->quirks->wmi_backlight_power)
 		acpi_video_set_dmi_backlight_type(acpi_backlight_vendor);
 
+	if (asus->driver->quirks->wmi_backlight_native)
+		acpi_video_set_dmi_backlight_type(acpi_backlight_native);
+
 	if (acpi_video_get_backlight_type() == acpi_backlight_vendor) {
 		err = asus_wmi_backlight_init(asus);
 		if (err && err != -ENODEV)
diff --git a/drivers/platform/x86/asus-wmi.h b/drivers/platform/x86/asus-wmi.h
index 5de1df510ebd..0e19014e9f54 100644
--- a/drivers/platform/x86/asus-wmi.h
+++ b/drivers/platform/x86/asus-wmi.h
@@ -28,6 +28,7 @@
 #define _ASUS_WMI_H_
 
 #include <linux/platform_device.h>
+#include <linux/i8042.h>
 
 #define ASUS_WMI_KEY_IGNORE (-1)
 #define ASUS_WMI_BRN_DOWN	0x20
@@ -43,6 +44,7 @@ struct quirk_entry {
 	bool scalar_panel_brightness;
 	bool store_backlight_power;
 	bool wmi_backlight_power;
+	bool wmi_backlight_native;
 	int wapf;
 	/*
 	 * For machines with AMD graphic chips, it will send out WMI event
@@ -51,6 +53,9 @@ struct quirk_entry {
 	 * and let the ACPI interrupt to send out the key event.
 	 */
 	int no_display_toggle;
+
+	bool (*i8042_filter)(unsigned char data, unsigned char str,
+			     struct serio *serio);
 };
 
 struct asus_wmi_driver {
diff --git a/drivers/platform/x86/dell-smo8800.c b/drivers/platform/x86/dell-smo8800.c
index 0aec4fd4c48e..37e646034ef8 100644
--- a/drivers/platform/x86/dell-smo8800.c
+++ b/drivers/platform/x86/dell-smo8800.c
@@ -24,6 +24,7 @@
 #include <linux/acpi.h>
 #include <linux/interrupt.h>
 #include <linux/miscdevice.h>
+#include <linux/uaccess.h>
 
 struct smo8800_device {
 	u32 irq;                     /* acpi device irq */
diff --git a/drivers/platform/x86/intel_pmc_core.c b/drivers/platform/x86/intel_pmc_core.c
index 520b58a04daa..e8b1b836ca2d 100644
--- a/drivers/platform/x86/intel_pmc_core.c
+++ b/drivers/platform/x86/intel_pmc_core.c
@@ -100,7 +100,7 @@ static int pmc_core_dbgfs_register(struct pmc_dev *pmcdev)
 	struct dentry *dir, *file;
 
 	dir = debugfs_create_dir("pmc_core", NULL);
-	if (IS_ERR_OR_NULL(dir))
+	if (!dir)
 		return -ENOMEM;
 
 	pmcdev->dbgfs_dir = dir;
diff --git a/drivers/platform/x86/intel_pmc_ipc.c b/drivers/platform/x86/intel_pmc_ipc.c
index a511d518206b..0bf51d574fa9 100644
--- a/drivers/platform/x86/intel_pmc_ipc.c
+++ b/drivers/platform/x86/intel_pmc_ipc.c
@@ -522,48 +522,36 @@ static struct resource telemetry_res[] = {
 static int ipc_create_punit_device(void)
 {
 	struct platform_device *pdev;
-	int ret;
-
-	pdev = platform_device_alloc(PUNIT_DEVICE_NAME, -1);
-	if (!pdev) {
-		dev_err(ipcdev.dev, "Failed to alloc punit platform device\n");
-		return -ENOMEM;
-	}
-
-	pdev->dev.parent = ipcdev.dev;
-	ret = platform_device_add_resources(pdev, punit_res_array,
-					    ARRAY_SIZE(punit_res_array));
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add platform punit resources\n");
-		goto err;
-	}
+	const struct platform_device_info pdevinfo = {
+		.parent = ipcdev.dev,
+		.name = PUNIT_DEVICE_NAME,
+		.id = -1,
+		.res = punit_res_array,
+		.num_res = ARRAY_SIZE(punit_res_array),
+		};
+
+	pdev = platform_device_register_full(&pdevinfo);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	ret = platform_device_add(pdev);
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add punit platform device\n");
-		goto err;
-	}
 	ipcdev.punit_dev = pdev;
 
 	return 0;
-err:
-	platform_device_put(pdev);
-	return ret;
 }
 
 static int ipc_create_tco_device(void)
 {
 	struct platform_device *pdev;
 	struct resource *res;
-	int ret;
-
-	pdev = platform_device_alloc(TCO_DEVICE_NAME, -1);
-	if (!pdev) {
-		dev_err(ipcdev.dev, "Failed to alloc tco platform device\n");
-		return -ENOMEM;
-	}
-
-	pdev->dev.parent = ipcdev.dev;
+	const struct platform_device_info pdevinfo = {
+		.parent = ipcdev.dev,
+		.name = TCO_DEVICE_NAME,
+		.id = -1,
+		.res = tco_res,
+		.num_res = ARRAY_SIZE(tco_res),
+		.data = &tco_info,
+		.size_data = sizeof(tco_info),
+		};
 
 	res = tco_res + TCO_RESOURCE_ACPI_IO;
 	res->start = ipcdev.acpi_io_base + TCO_BASE_OFFSET;
@@ -577,45 +565,26 @@ static int ipc_create_tco_device(void)
 	res->start = ipcdev.gcr_base + TCO_PMC_OFFSET;
 	res->end = res->start + TCO_PMC_SIZE - 1;
 
-	ret = platform_device_add_resources(pdev, tco_res, ARRAY_SIZE(tco_res));
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add tco platform resources\n");
-		goto err;
-	}
+	pdev = platform_device_register_full(&pdevinfo);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	ret = platform_device_add_data(pdev, &tco_info, sizeof(tco_info));
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add tco platform data\n");
-		goto err;
-	}
-
-	ret = platform_device_add(pdev);
-	if (ret) {
-		dev_err(ipcdev.dev, "Failed to add tco platform device\n");
-		goto err;
-	}
 	ipcdev.tco_dev = pdev;
 
 	return 0;
-err:
-	platform_device_put(pdev);
-	return ret;
 }
 
 static int ipc_create_telemetry_device(void)
 {
 	struct platform_device *pdev;
 	struct resource *res;
-	int ret;
-
-	pdev = platform_device_alloc(TELEMETRY_DEVICE_NAME, -1);
-	if (!pdev) {
-		dev_err(ipcdev.dev,
-			"Failed to allocate telemetry platform device\n");
-		return -ENOMEM;
-	}
-
-	pdev->dev.parent = ipcdev.dev;
+	const struct platform_device_info pdevinfo = {
+		.parent = ipcdev.dev,
+		.name = TELEMETRY_DEVICE_NAME,
+		.id = -1,
+		.res = telemetry_res,
+		.num_res = ARRAY_SIZE(telemetry_res),
+		};
 
 	res = telemetry_res + TELEMETRY_RESOURCE_PUNIT_SSRAM;
 	res->start = ipcdev.telem_punit_ssram_base;
@@ -625,26 +594,13 @@ static int ipc_create_telemetry_device(void)
 	res->start = ipcdev.telem_pmc_ssram_base;
 	res->end = res->start + ipcdev.telem_pmc_ssram_size - 1;
 
-	ret = platform_device_add_resources(pdev, telemetry_res,
-					    ARRAY_SIZE(telemetry_res));
-	if (ret) {
-		dev_err(ipcdev.dev,
-			"Failed to add telemetry platform resources\n");
-		goto err;
-	}
+	pdev = platform_device_register_full(&pdevinfo);
+	if (IS_ERR(pdev))
+		return PTR_ERR(pdev);
 
-	ret = platform_device_add(pdev);
-	if (ret) {
-		dev_err(ipcdev.dev,
-			"Failed to add telemetry platform device\n");
-		goto err;
-	}
 	ipcdev.telemetry_dev = pdev;
 
 	return 0;
-err:
-	platform_device_put(pdev);
-	return ret;
 }
 
 static int ipc_create_pmc_devices(void)
diff --git a/drivers/platform/x86/toshiba_acpi.c b/drivers/platform/x86/toshiba_acpi.c
index 9d60a40d8b3f..074bf2fa1c55 100644
--- a/drivers/platform/x86/toshiba_acpi.c
+++ b/drivers/platform/x86/toshiba_acpi.c
@@ -321,10 +321,9 @@ static int write_acpi_int(const char *methodName, int val)
 static acpi_status tci_raw(struct toshiba_acpi_dev *dev,
 			   const u32 in[TCI_WORDS], u32 out[TCI_WORDS])
 {
+	union acpi_object in_objs[TCI_WORDS], out_objs[TCI_WORDS + 1];
 	struct acpi_object_list params;
-	union acpi_object in_objs[TCI_WORDS];
 	struct acpi_buffer results;
-	union acpi_object out_objs[TCI_WORDS + 1];
 	acpi_status status;
 	int i;
 
@@ -387,9 +386,8 @@ static int sci_open(struct toshiba_acpi_dev *dev)
 {
 	u32 in[TCI_WORDS] = { SCI_OPEN, 0, 0, 0, 0, 0 };
 	u32 out[TCI_WORDS];
-	acpi_status status;
+	acpi_status status = tci_raw(dev, in, out);
 
-	status = tci_raw(dev, in, out);
 	if  (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to open SCI failed\n");
 		return 0;
@@ -425,9 +423,8 @@ static void sci_close(struct toshiba_acpi_dev *dev)
 {
 	u32 in[TCI_WORDS] = { SCI_CLOSE, 0, 0, 0, 0, 0 };
 	u32 out[TCI_WORDS];
-	acpi_status status;
+	acpi_status status = tci_raw(dev, in, out);
 
-	status = tci_raw(dev, in, out);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to close SCI failed\n");
 		return;
@@ -479,10 +476,15 @@ static void toshiba_illumination_available(struct toshiba_acpi_dev *dev)
 
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query Illumination support failed\n");
-	else if (out[0] == TOS_SUCCESS)
-		dev->illumination_supported = 1;
+		return;
+	}
+
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	dev->illumination_supported = 1;
 }
 
 static void toshiba_illumination_set(struct led_classdev *cdev,
@@ -509,7 +511,8 @@ static enum led_brightness toshiba_illumination_get(struct led_classdev *cdev)
 {
 	struct toshiba_acpi_dev *dev = container_of(cdev,
 			struct toshiba_acpi_dev, led_dev);
-	u32 state, result;
+	u32 result;
+	u32 state;
 
 	/* First request : initialize communication. */
 	if (!sci_open(dev))
@@ -546,24 +549,28 @@ static void toshiba_kbd_illum_available(struct toshiba_acpi_dev *dev)
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query kbd illumination support failed\n");
-	} else if (out[0] == TOS_SUCCESS) {
-		/*
-		 * Check for keyboard backlight timeout max value,
-		 * previous kbd backlight implementation set this to
-		 * 0x3c0003, and now the new implementation set this
-		 * to 0x3c001a, use this to distinguish between them.
-		 */
-		if (out[3] == SCI_KBD_TIME_MAX)
-			dev->kbd_type = 2;
-		else
-			dev->kbd_type = 1;
-		/* Get the current keyboard backlight mode */
-		dev->kbd_mode = out[2] & SCI_KBD_MODE_MASK;
-		/* Get the current time (1-60 seconds) */
-		dev->kbd_time = out[2] >> HCI_MISC_SHIFT;
-		/* Flag as supported */
-		dev->kbd_illum_supported = 1;
+		return;
 	}
+
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	/*
+	 * Check for keyboard backlight timeout max value,
+	 * previous kbd backlight implementation set this to
+	 * 0x3c0003, and now the new implementation set this
+	 * to 0x3c001a, use this to distinguish between them.
+	 */
+	if (out[3] == SCI_KBD_TIME_MAX)
+		dev->kbd_type = 2;
+	else
+		dev->kbd_type = 1;
+	/* Get the current keyboard backlight mode */
+	dev->kbd_mode = out[2] & SCI_KBD_MODE_MASK;
+	/* Get the current time (1-60 seconds) */
+	dev->kbd_time = out[2] >> HCI_MISC_SHIFT;
+	/* Flag as supported */
+	dev->kbd_illum_supported = 1;
 }
 
 static int toshiba_kbd_illum_status_set(struct toshiba_acpi_dev *dev, u32 time)
@@ -672,9 +679,9 @@ static int toshiba_touchpad_get(struct toshiba_acpi_dev *dev, u32 *state)
 /* Eco Mode support */
 static void toshiba_eco_mode_available(struct toshiba_acpi_dev *dev)
 {
-	acpi_status status;
 	u32 in[TCI_WORDS] = { HCI_GET, HCI_ECO_MODE, 0, 0, 0, 0 };
 	u32 out[TCI_WORDS];
+	acpi_status status;
 
 	dev->eco_supported = 0;
 	dev->eco_led_registered = false;
@@ -682,7 +689,10 @@ static void toshiba_eco_mode_available(struct toshiba_acpi_dev *dev)
 	status = tci_raw(dev, in, out);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get ECO led failed\n");
-	} else if (out[0] == TOS_INPUT_DATA_ERROR) {
+		return;
+	}
+
+	if (out[0] == TOS_INPUT_DATA_ERROR) {
 		/*
 		 * If we receive 0x8300 (Input Data Error), it means that the
 		 * LED device is present, but that we just screwed the input
@@ -694,10 +704,15 @@ static void toshiba_eco_mode_available(struct toshiba_acpi_dev *dev)
 		 */
 		in[3] = 1;
 		status = tci_raw(dev, in, out);
-		if (ACPI_FAILURE(status))
+		if (ACPI_FAILURE(status)) {
 			pr_err("ACPI call to get ECO led failed\n");
-		else if (out[0] == TOS_SUCCESS)
-			dev->eco_supported = 1;
+			return;
+		}
+
+		if (out[0] != TOS_SUCCESS)
+			return;
+
+		dev->eco_supported = 1;
 	}
 }
 
@@ -714,10 +729,11 @@ toshiba_eco_mode_get_status(struct led_classdev *cdev)
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get ECO led failed\n");
 		return LED_OFF;
-	} else if (out[0] != TOS_SUCCESS) {
-		return LED_OFF;
 	}
 
+	if (out[0] != TOS_SUCCESS)
+		return LED_OFF;
+
 	return out[2] ? LED_FULL : LED_OFF;
 }
 
@@ -751,10 +767,15 @@ static void toshiba_accelerometer_available(struct toshiba_acpi_dev *dev)
 	 * this call also serves as initialization
 	 */
 	status = tci_raw(dev, in, out);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query the accelerometer failed\n");
-	else if (out[0] == TOS_SUCCESS)
-		dev->accelerometer_supported = 1;
+		return;
+	}
+
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	dev->accelerometer_supported = 1;
 }
 
 static int toshiba_accelerometer_get(struct toshiba_acpi_dev *dev,
@@ -769,15 +790,18 @@ static int toshiba_accelerometer_get(struct toshiba_acpi_dev *dev,
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to query the accelerometer failed\n");
 		return -EIO;
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS) {
-		*xy = out[2];
-		*z = out[4];
-		return 0;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS)
+		return -EIO;
+
+	*xy = out[2];
+	*z = out[4];
+
+	return 0;
 }
 
 /* Sleep (Charge and Music) utilities support */
@@ -797,24 +821,29 @@ static void toshiba_usb_sleep_charge_available(struct toshiba_acpi_dev *dev)
 		pr_err("ACPI call to get USB Sleep and Charge mode failed\n");
 		sci_close(dev);
 		return;
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
+	}
+
+	if (out[0] != TOS_SUCCESS) {
 		sci_close(dev);
 		return;
-	} else if (out[0] == TOS_SUCCESS) {
-		dev->usbsc_mode_base = out[4];
 	}
 
+	dev->usbsc_mode_base = out[4];
+
 	in[5] = SCI_USB_CHARGE_BAT_LVL;
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get USB Sleep and Charge mode failed\n");
-	} else if (out[0] == TOS_SUCCESS) {
-		dev->usbsc_bat_level = out[2];
-		/* Flag as supported */
-		dev->usb_sleep_charge_supported = 1;
+		return;
 	}
 
+	if (out[0] != TOS_SUCCESS)
+		return;
+
+	dev->usbsc_bat_level = out[2];
+	/* Flag as supported */
+	dev->usb_sleep_charge_supported = 1;
 }
 
 static int toshiba_usb_sleep_charge_get(struct toshiba_acpi_dev *dev,
@@ -868,14 +897,19 @@ static int toshiba_sleep_functions_status_get(struct toshiba_acpi_dev *dev,
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get USB S&C battery level failed\n");
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS) {
-		*mode = out[2];
-		return 0;
+		return -EIO;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS)
+		return -EIO;
+
+	*mode = out[2];
+
+	return 0;
+
 }
 
 static int toshiba_sleep_functions_status_set(struct toshiba_acpi_dev *dev,
@@ -892,9 +926,12 @@ static int toshiba_sleep_functions_status_set(struct toshiba_acpi_dev *dev,
 	in[5] = SCI_USB_CHARGE_BAT_LVL;
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set USB S&C battery level failed\n");
-	else if (out[0] == TOS_NOT_SUPPORTED)
+		return -EIO;
+	}
+
+	if (out[0] == TOS_NOT_SUPPORTED)
 		return -ENODEV;
 
 	return out[0] == TOS_SUCCESS ? 0 : -EIO;
@@ -915,14 +952,18 @@ static int toshiba_usb_rapid_charge_get(struct toshiba_acpi_dev *dev,
 	sci_close(dev);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get USB Rapid Charge failed\n");
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS || out[0] == TOS_SUCCESS2) {
-		*state = out[2];
-		return 0;
+		return -EIO;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS && out[0] != TOS_SUCCESS2)
+		return -EIO;
+
+	*state = out[2];
+
+	return 0;
 }
 
 static int toshiba_usb_rapid_charge_set(struct toshiba_acpi_dev *dev,
@@ -939,9 +980,12 @@ static int toshiba_usb_rapid_charge_set(struct toshiba_acpi_dev *dev,
 	in[5] = SCI_USB_CHARGE_RAPID_DSP;
 	status = tci_raw(dev, in, out);
 	sci_close(dev);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set USB Rapid Charge failed\n");
-	else if (out[0] == TOS_NOT_SUPPORTED)
+		return -EIO;
+	}
+
+	if (out[0] == TOS_NOT_SUPPORTED)
 		return -ENODEV;
 
 	return (out[0] == TOS_SUCCESS || out[0] == TOS_SUCCESS2) ? 0 : -EIO;
@@ -1097,14 +1141,18 @@ static int toshiba_hotkey_event_type_get(struct toshiba_acpi_dev *dev,
 	status = tci_raw(dev, in, out);
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get System type failed\n");
-	} else if (out[0] == TOS_NOT_SUPPORTED) {
-		return -ENODEV;
-	} else if (out[0] == TOS_SUCCESS) {
-		*type = out[3];
-		return 0;
+		return -EIO;
 	}
 
-	return -EIO;
+	if (out[0] == TOS_NOT_SUPPORTED)
+		return -ENODEV;
+
+	if (out[0] != TOS_SUCCESS)
+		return -EIO;
+
+	*type = out[3];
+
+	return 0;
 }
 
 /* Wireless status (RFKill, WLAN, BT, WWAN) */
@@ -1154,7 +1202,6 @@ static void toshiba_wwan_available(struct toshiba_acpi_dev *dev)
 	 */
 	in[3] = HCI_WIRELESS_WWAN;
 	status = tci_raw(dev, in, out);
-
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get WWAN status failed\n");
 		return;
@@ -1174,7 +1221,6 @@ static int toshiba_wwan_set(struct toshiba_acpi_dev *dev, u32 state)
 
 	in[3] = HCI_WIRELESS_WWAN_STATUS;
 	status = tci_raw(dev, in, out);
-
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set WWAN status failed\n");
 		return -EIO;
@@ -1193,7 +1239,6 @@ static int toshiba_wwan_set(struct toshiba_acpi_dev *dev, u32 state)
 	 */
 	in[3] = HCI_WIRELESS_WWAN_POWER;
 	status = tci_raw(dev, in, out);
-
 	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to set WWAN power failed\n");
 		return -EIO;
@@ -1216,8 +1261,10 @@ static void toshiba_cooling_method_available(struct toshiba_acpi_dev *dev)
 	dev->max_cooling_method = 0;
 
 	status = tci_raw(dev, in, out);
-	if (ACPI_FAILURE(status))
+	if (ACPI_FAILURE(status)) {
 		pr_err("ACPI call to get Cooling Method failed\n");
+		return;
+	}
 
 	if (out[0] != TOS_SUCCESS && out[0] != TOS_SUCCESS2)
 		return;
@@ -1244,7 +1291,7 @@ static int toshiba_cooling_method_set(struct toshiba_acpi_dev *dev, u32 state)
 	u32 result = hci_write(dev, HCI_COOLING_METHOD, state);
 
 	if (result == TOS_FAILURE)
-		pr_err("ACPI call to get Cooling Method failed\n");
+		pr_err("ACPI call to set Cooling Method failed\n");
 
 	if (result == TOS_NOT_SUPPORTED)
 		return -ENODEV;
@@ -1282,9 +1329,9 @@ static struct proc_dir_entry *toshiba_proc_dir;
 /* LCD Brightness */
 static int __get_lcd_brightness(struct toshiba_acpi_dev *dev)
 {
+	int brightness = 0;
 	u32 result;
 	u32 value;
-	int brightness = 0;
 
 	if (dev->tr_backlight_supported) {
 		int ret = get_tr_backlight_status(dev, &value);
@@ -1301,10 +1348,10 @@ static int __get_lcd_brightness(struct toshiba_acpi_dev *dev)
 		pr_err("ACPI call to get LCD Brightness failed\n");
 	else if (result == TOS_NOT_SUPPORTED)
 		return -ENODEV;
-	if (result == TOS_SUCCESS)
-		return brightness + (value >> HCI_LCD_BRIGHTNESS_SHIFT);
 
-	return -EIO;
+	return result == TOS_SUCCESS ?
+			brightness + (value >> HCI_LCD_BRIGHTNESS_SHIFT) :
+			-EIO;
 }
 
 static int get_lcd_brightness(struct backlight_device *bd)
@@ -1325,15 +1372,15 @@ static int lcd_proc_show(struct seq_file *m, void *v)
 
 	levels = dev->backlight_dev->props.max_brightness + 1;
 	value = get_lcd_brightness(dev->backlight_dev);
-	if (value >= 0) {
-		seq_printf(m, "brightness:              %d\n", value);
-		seq_printf(m, "brightness_levels:       %d\n", levels);
-		return 0;
+	if (value < 0) {
+		pr_err("Error reading LCD brightness\n");
+		return value;
 	}
 
-	pr_err("Error reading LCD brightness\n");
+	seq_printf(m, "brightness:              %d\n", value);
+	seq_printf(m, "brightness_levels:       %d\n", levels);
 
-	return -EIO;
+	return 0;
 }
 
 static int lcd_proc_open(struct inode *inode, struct file *file)
@@ -1377,7 +1424,7 @@ static ssize_t lcd_proc_write(struct file *file, const char __user *buf,
 	struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file));
 	char cmd[42];
 	size_t len;
-	int levels = dev->backlight_dev->props.max_brightness + 1;
+	int levels;
 	int value;
 
 	len = min(count, sizeof(cmd) - 1);
@@ -1385,6 +1432,7 @@ static ssize_t lcd_proc_write(struct file *file, const char __user *buf,
 		return -EFAULT;
 	cmd[len] = '\0';
 
+	levels = dev->backlight_dev->props.max_brightness + 1;
 	if (sscanf(cmd, " brightness : %i", &value) != 1 &&
 	    value < 0 && value > levels)
 		return -EINVAL;
@@ -1420,20 +1468,21 @@ static int get_video_status(struct toshiba_acpi_dev *dev, u32 *status)
 static int video_proc_show(struct seq_file *m, void *v)
 {
 	struct toshiba_acpi_dev *dev = m->private;
+	int is_lcd, is_crt, is_tv;
 	u32 value;
 
-	if (!get_video_status(dev, &value)) {
-		int is_lcd = (value & HCI_VIDEO_OUT_LCD) ? 1 : 0;
-		int is_crt = (value & HCI_VIDEO_OUT_CRT) ? 1 : 0;
-		int is_tv = (value & HCI_VIDEO_OUT_TV) ? 1 : 0;
+	if (get_video_status(dev, &value))
+		return -EIO;
 
-		seq_printf(m, "lcd_out:                 %d\n", is_lcd);
-		seq_printf(m, "crt_out:                 %d\n", is_crt);
-		seq_printf(m, "tv_out:                  %d\n", is_tv);
-		return 0;
-	}
+	is_lcd = (value & HCI_VIDEO_OUT_LCD) ? 1 : 0;
+	is_crt = (value & HCI_VIDEO_OUT_CRT) ? 1 : 0;
+	is_tv = (value & HCI_VIDEO_OUT_TV) ? 1 : 0;
 
-	return -EIO;
+	seq_printf(m, "lcd_out:                 %d\n", is_lcd);
+	seq_printf(m, "crt_out:                 %d\n", is_crt);
+	seq_printf(m, "tv_out:                  %d\n", is_tv);
+
+	return 0;
 }
 
 static int video_proc_open(struct inode *inode, struct file *file)
@@ -1447,10 +1496,8 @@ static ssize_t video_proc_write(struct file *file, const char __user *buf,
 	struct toshiba_acpi_dev *dev = PDE_DATA(file_inode(file));
 	char *buffer;
 	char *cmd;
+	int lcd_out, crt_out, tv_out;
 	int remain = count;
-	int lcd_out = -1;
-	int crt_out = -1;
-	int tv_out = -1;
 	int value;
 	int ret;
 	u32 video_out;
@@ -1486,6 +1533,7 @@ static ssize_t video_proc_write(struct file *file, const char __user *buf,
 
 	kfree(cmd);
 
+	lcd_out = crt_out = tv_out = -1;
 	ret = get_video_status(dev, &video_out);
 	if (!ret) {
 		unsigned int new_video_out = video_out;
@@ -1980,8 +2028,8 @@ static ssize_t usb_sleep_charge_store(struct device *dev,
 				      const char *buf, size_t count)
 {
 	struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
-	u32 mode;
 	int state;
+	u32 mode;
 	int ret;
 
 	ret = kstrtoint(buf, 0, &state);
@@ -2021,9 +2069,8 @@ static ssize_t sleep_functions_on_battery_show(struct device *dev,
 					       char *buf)
 {
 	struct toshiba_acpi_dev *toshiba = dev_get_drvdata(dev);
+	int bat_lvl, status;
 	u32 state;
-	int bat_lvl;
-	int status;
 	int ret;
 	int tmp;
 
diff --git a/drivers/platform/x86/toshiba_bluetooth.c b/drivers/platform/x86/toshiba_bluetooth.c
index 5db495dd018e..be1d137c6079 100644
--- a/drivers/platform/x86/toshiba_bluetooth.c
+++ b/drivers/platform/x86/toshiba_bluetooth.c
@@ -80,7 +80,9 @@ static int toshiba_bluetooth_present(acpi_handle handle)
 	if (ACPI_FAILURE(result)) {
 		pr_err("ACPI call to query Bluetooth presence failed\n");
 		return -ENXIO;
-	} else if (!bt_present) {
+	}
+
+	if (!bt_present) {
 		pr_info("Bluetooth device not present\n");
 		return -ENODEV;
 	}
diff --git a/drivers/platform/x86/toshiba_haps.c b/drivers/platform/x86/toshiba_haps.c
index 7f2afc6b5eb9..b3dec521e2b6 100644
--- a/drivers/platform/x86/toshiba_haps.c
+++ b/drivers/platform/x86/toshiba_haps.c
@@ -59,7 +59,7 @@ static int toshiba_haps_protection_level(acpi_handle handle, int level)
 		return -EIO;
 	}
 
-	pr_info("HDD protection level set to: %d\n", level);
+	pr_debug("HDD protection level set to: %d\n", level);
 
 	return 0;
 }
@@ -141,7 +141,7 @@ static struct attribute_group haps_attr_group = {
  */
 static void toshiba_haps_notify(struct acpi_device *device, u32 event)
 {
-	pr_info("Received event: 0x%x", event);
+	pr_debug("Received event: 0x%x", event);
 
 	acpi_bus_generate_netlink_event(device->pnp.device_class,
 					dev_name(&device->dev),
@@ -168,9 +168,13 @@ static int toshiba_haps_available(acpi_handle handle)
 	 * A non existent device as well as having (only)
 	 * Solid State Drives can cause the call to fail.
 	 */
-	status = acpi_evaluate_integer(handle, "_STA", NULL,
-				       &hdd_present);
-	if (ACPI_FAILURE(status) || !hdd_present) {
+	status = acpi_evaluate_integer(handle, "_STA", NULL, &hdd_present);
+	if (ACPI_FAILURE(status)) {
+		pr_err("ACPI call to query HDD protection failed\n");
+		return 0;
+	}
+
+	if (!hdd_present) {
 		pr_info("HDD protection not available or using SSD\n");
 		return 0;
 	}
diff --git a/drivers/ptp/ptp_chardev.c b/drivers/ptp/ptp_chardev.c
index d637c933c8a9..58a97d420572 100644
--- a/drivers/ptp/ptp_chardev.c
+++ b/drivers/ptp/ptp_chardev.c
@@ -193,6 +193,7 @@ long ptp_ioctl(struct posix_clock *pc, unsigned int cmd, unsigned long arg)
 		if (err)
 			break;
 
+		memset(&precise_offset, 0, sizeof(precise_offset));
 		ts = ktime_to_timespec64(xtstamp.device);
 		precise_offset.device.sec = ts.tv_sec;
 		precise_offset.device.nsec = ts.tv_nsec;
diff --git a/drivers/pwm/Kconfig b/drivers/pwm/Kconfig
index 80a566a00d04..bf0128899c09 100644
--- a/drivers/pwm/Kconfig
+++ b/drivers/pwm/Kconfig
@@ -262,6 +262,15 @@ config PWM_LPSS_PLATFORM
 	  To compile this driver as a module, choose M here: the module
 	  will be called pwm-lpss-platform.
 
+config PWM_MESON
+	tristate "Amlogic Meson PWM driver"
+	depends on ARCH_MESON
+	help
+	  The platform driver for Amlogic Meson PWM controller.
+
+	  To compile this driver as a module, choose M here: the module
+	  will be called pwm-meson.
+
 config PWM_MTK_DISP
 	tristate "MediaTek display PWM driver"
 	depends on ARCH_MEDIATEK || COMPILE_TEST
diff --git a/drivers/pwm/Makefile b/drivers/pwm/Makefile
index feef1dd29f73..1194c54efcc2 100644
--- a/drivers/pwm/Makefile
+++ b/drivers/pwm/Makefile
@@ -24,6 +24,7 @@ obj-$(CONFIG_PWM_LPC32XX)	+= pwm-lpc32xx.o
 obj-$(CONFIG_PWM_LPSS)		+= pwm-lpss.o
 obj-$(CONFIG_PWM_LPSS_PCI)	+= pwm-lpss-pci.o
 obj-$(CONFIG_PWM_LPSS_PLATFORM)	+= pwm-lpss-platform.o
+obj-$(CONFIG_PWM_MESON)		+= pwm-meson.o
 obj-$(CONFIG_PWM_MTK_DISP)	+= pwm-mtk-disp.o
 obj-$(CONFIG_PWM_MXS)		+= pwm-mxs.o
 obj-$(CONFIG_PWM_OMAP_DMTIMER)	+= pwm-omap-dmtimer.o
diff --git a/drivers/pwm/core.c b/drivers/pwm/core.c
index 0dbd29e287db..172ef8245811 100644
--- a/drivers/pwm/core.c
+++ b/drivers/pwm/core.c
@@ -339,6 +339,8 @@ int pwmchip_remove(struct pwm_chip *chip)
 	unsigned int i;
 	int ret = 0;
 
+	pwmchip_sysfs_unexport_children(chip);
+
 	mutex_lock(&pwm_lock);
 
 	for (i = 0; i < chip->npwm; i++) {
diff --git a/drivers/pwm/pwm-berlin.c b/drivers/pwm/pwm-berlin.c
index 65108129d505..01339c152ab0 100644
--- a/drivers/pwm/pwm-berlin.c
+++ b/drivers/pwm/pwm-berlin.c
@@ -16,6 +16,7 @@
 #include <linux/module.h>
 #include <linux/platform_device.h>
 #include <linux/pwm.h>
+#include <linux/slab.h>
 
 #define BERLIN_PWM_EN			0x0
 #define  BERLIN_PWM_ENABLE		BIT(0)
@@ -27,6 +28,13 @@
 #define BERLIN_PWM_TCNT			0xc
 #define  BERLIN_PWM_MAX_TCNT		65535
 
+struct berlin_pwm_channel {
+	u32 enable;
+	u32 ctrl;
+	u32 duty;
+	u32 tcnt;
+};
+
 struct berlin_pwm_chip {
 	struct pwm_chip chip;
 	struct clk *clk;
@@ -55,6 +63,25 @@ static inline void berlin_pwm_writel(struct berlin_pwm_chip *chip,
 	writel_relaxed(value, chip->base + channel * 0x10 + offset);
 }
 
+static int berlin_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct berlin_pwm_channel *channel;
+
+	channel = kzalloc(sizeof(*channel), GFP_KERNEL);
+	if (!channel)
+		return -ENOMEM;
+
+	return pwm_set_chip_data(pwm, channel);
+}
+
+static void berlin_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct berlin_pwm_channel *channel = pwm_get_chip_data(pwm);
+
+	pwm_set_chip_data(pwm, NULL);
+	kfree(channel);
+}
+
 static int berlin_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm_dev,
 			     int duty_ns, int period_ns)
 {
@@ -137,6 +164,8 @@ static void berlin_pwm_disable(struct pwm_chip *chip,
 }
 
 static const struct pwm_ops berlin_pwm_ops = {
+	.request = berlin_pwm_request,
+	.free = berlin_pwm_free,
 	.config = berlin_pwm_config,
 	.set_polarity = berlin_pwm_set_polarity,
 	.enable = berlin_pwm_enable,
@@ -204,12 +233,67 @@ static int berlin_pwm_remove(struct platform_device *pdev)
 	return ret;
 }
 
+#ifdef CONFIG_PM_SLEEP
+static int berlin_pwm_suspend(struct device *dev)
+{
+	struct berlin_pwm_chip *pwm = dev_get_drvdata(dev);
+	unsigned int i;
+
+	for (i = 0; i < pwm->chip.npwm; i++) {
+		struct berlin_pwm_channel *channel;
+
+		channel = pwm_get_chip_data(&pwm->chip.pwms[i]);
+		if (!channel)
+			continue;
+
+		channel->enable = berlin_pwm_readl(pwm, i, BERLIN_PWM_ENABLE);
+		channel->ctrl = berlin_pwm_readl(pwm, i, BERLIN_PWM_CONTROL);
+		channel->duty = berlin_pwm_readl(pwm, i, BERLIN_PWM_DUTY);
+		channel->tcnt = berlin_pwm_readl(pwm, i, BERLIN_PWM_TCNT);
+	}
+
+	clk_disable_unprepare(pwm->clk);
+
+	return 0;
+}
+
+static int berlin_pwm_resume(struct device *dev)
+{
+	struct berlin_pwm_chip *pwm = dev_get_drvdata(dev);
+	unsigned int i;
+	int ret;
+
+	ret = clk_prepare_enable(pwm->clk);
+	if (ret)
+		return ret;
+
+	for (i = 0; i < pwm->chip.npwm; i++) {
+		struct berlin_pwm_channel *channel;
+
+		channel = pwm_get_chip_data(&pwm->chip.pwms[i]);
+		if (!channel)
+			continue;
+
+		berlin_pwm_writel(pwm, i, channel->ctrl, BERLIN_PWM_CONTROL);
+		berlin_pwm_writel(pwm, i, channel->duty, BERLIN_PWM_DUTY);
+		berlin_pwm_writel(pwm, i, channel->tcnt, BERLIN_PWM_TCNT);
+		berlin_pwm_writel(pwm, i, channel->enable, BERLIN_PWM_ENABLE);
+	}
+
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(berlin_pwm_pm_ops, berlin_pwm_suspend,
+			 berlin_pwm_resume);
+
 static struct platform_driver berlin_pwm_driver = {
 	.probe = berlin_pwm_probe,
 	.remove = berlin_pwm_remove,
 	.driver = {
 		.name = "berlin-pwm",
 		.of_match_table = berlin_pwm_match,
+		.pm = &berlin_pwm_pm_ops,
 	},
 };
 module_platform_driver(berlin_pwm_driver);
diff --git a/drivers/pwm/pwm-cros-ec.c b/drivers/pwm/pwm-cros-ec.c
index 99b9acc1a420..f6ca4e8c6253 100644
--- a/drivers/pwm/pwm-cros-ec.c
+++ b/drivers/pwm/pwm-cros-ec.c
@@ -38,7 +38,7 @@ static int cros_ec_pwm_set_duty(struct cros_ec_device *ec, u8 index, u16 duty)
 	struct {
 		struct cros_ec_command msg;
 		struct ec_params_pwm_set_duty params;
-	} buf;
+	} __packed buf;
 	struct ec_params_pwm_set_duty *params = &buf.params;
 	struct cros_ec_command *msg = &buf.msg;
 
@@ -65,7 +65,7 @@ static int __cros_ec_pwm_get_duty(struct cros_ec_device *ec, u8 index,
 			struct ec_params_pwm_get_duty params;
 			struct ec_response_pwm_get_duty resp;
 		};
-	} buf;
+	} __packed buf;
 	struct ec_params_pwm_get_duty *params = &buf.params;
 	struct ec_response_pwm_get_duty *resp = &buf.resp;
 	struct cros_ec_command *msg = &buf.msg;
diff --git a/drivers/pwm/pwm-lpc18xx-sct.c b/drivers/pwm/pwm-lpc18xx-sct.c
index 19dc64cab2f0..d7f5f7de030d 100644
--- a/drivers/pwm/pwm-lpc18xx-sct.c
+++ b/drivers/pwm/pwm-lpc18xx-sct.c
@@ -413,14 +413,18 @@ static int lpc18xx_pwm_probe(struct platform_device *pdev)
 	}
 
 	for (i = 0; i < lpc18xx_pwm->chip.npwm; i++) {
+		struct lpc18xx_pwm_data *data;
+
 		pwm = &lpc18xx_pwm->chip.pwms[i];
-		pwm->chip_data = devm_kzalloc(lpc18xx_pwm->dev,
-					      sizeof(struct lpc18xx_pwm_data),
-					      GFP_KERNEL);
-		if (!pwm->chip_data) {
+
+		data = devm_kzalloc(lpc18xx_pwm->dev, sizeof(*data),
+				    GFP_KERNEL);
+		if (!data) {
 			ret = -ENOMEM;
 			goto remove_pwmchip;
 		}
+
+		pwm_set_chip_data(pwm, data);
 	}
 
 	platform_set_drvdata(pdev, lpc18xx_pwm);
diff --git a/drivers/pwm/pwm-meson.c b/drivers/pwm/pwm-meson.c
new file mode 100644
index 000000000000..381871b2bb46
--- /dev/null
+++ b/drivers/pwm/pwm-meson.c
@@ -0,0 +1,529 @@
+/*
+ * This file is provided under a dual BSD/GPLv2 license.  When using or
+ * redistributing this file, you may do so under either license.
+ *
+ * GPL LICENSE SUMMARY
+ *
+ * Copyright (c) 2016 BayLibre, SAS.
+ * Author: Neil Armstrong <narmstrong@baylibre.com>
+ * Copyright (C) 2014 Amlogic, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of version 2 of the GNU General Public License as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ * The full GNU General Public License is included in this distribution
+ * in the file called COPYING.
+ *
+ * BSD LICENSE
+ *
+ * Copyright (c) 2016 BayLibre, SAS.
+ * Author: Neil Armstrong <narmstrong@baylibre.com>
+ * Copyright (C) 2014 Amlogic, Inc.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ *
+ *   * Redistributions of source code must retain the above copyright
+ *     notice, this list of conditions and the following disclaimer.
+ *   * Redistributions in binary form must reproduce the above copyright
+ *     notice, this list of conditions and the following disclaimer in
+ *     the documentation and/or other materials provided with the
+ *     distribution.
+ *   * Neither the name of Intel Corporation nor the names of its
+ *     contributors may be used to endorse or promote products derived
+ *     from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <linux/clk.h>
+#include <linux/clk-provider.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/of_device.h>
+#include <linux/platform_device.h>
+#include <linux/pwm.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+
+#define REG_PWM_A		0x0
+#define REG_PWM_B		0x4
+#define PWM_HIGH_SHIFT		16
+
+#define REG_MISC_AB		0x8
+#define MISC_B_CLK_EN		BIT(23)
+#define MISC_A_CLK_EN		BIT(15)
+#define MISC_CLK_DIV_MASK	0x7f
+#define MISC_B_CLK_DIV_SHIFT	16
+#define MISC_A_CLK_DIV_SHIFT	8
+#define MISC_B_CLK_SEL_SHIFT	6
+#define MISC_A_CLK_SEL_SHIFT	4
+#define MISC_CLK_SEL_WIDTH	2
+#define MISC_B_EN		BIT(1)
+#define MISC_A_EN		BIT(0)
+
+static const unsigned int mux_reg_shifts[] = {
+	MISC_A_CLK_SEL_SHIFT,
+	MISC_B_CLK_SEL_SHIFT
+};
+
+struct meson_pwm_channel {
+	unsigned int hi;
+	unsigned int lo;
+	u8 pre_div;
+
+	struct pwm_state state;
+
+	struct clk *clk_parent;
+	struct clk_mux mux;
+	struct clk *clk;
+};
+
+struct meson_pwm_data {
+	const char * const *parent_names;
+};
+
+struct meson_pwm {
+	struct pwm_chip chip;
+	const struct meson_pwm_data *data;
+	void __iomem *base;
+	u8 inverter_mask;
+	spinlock_t lock;
+};
+
+static inline struct meson_pwm *to_meson_pwm(struct pwm_chip *chip)
+{
+	return container_of(chip, struct meson_pwm, chip);
+}
+
+static int meson_pwm_request(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
+	struct device *dev = chip->dev;
+	int err;
+
+	if (!channel)
+		return -ENODEV;
+
+	if (channel->clk_parent) {
+		err = clk_set_parent(channel->clk, channel->clk_parent);
+		if (err < 0) {
+			dev_err(dev, "failed to set parent %s for %s: %d\n",
+				__clk_get_name(channel->clk_parent),
+				__clk_get_name(channel->clk), err);
+				return err;
+		}
+	}
+
+	err = clk_prepare_enable(channel->clk);
+	if (err < 0) {
+		dev_err(dev, "failed to enable clock %s: %d\n",
+			__clk_get_name(channel->clk), err);
+		return err;
+	}
+
+	chip->ops->get_state(chip, pwm, &channel->state);
+
+	return 0;
+}
+
+static void meson_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
+{
+	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
+
+	if (channel)
+		clk_disable_unprepare(channel->clk);
+}
+
+static int meson_pwm_calc(struct meson_pwm *meson,
+			  struct meson_pwm_channel *channel, unsigned int id,
+			  unsigned int duty, unsigned int period)
+{
+	unsigned int pre_div, cnt, duty_cnt;
+	unsigned long fin_freq = -1, fin_ns;
+
+	if (~(meson->inverter_mask >> id) & 0x1)
+		duty = period - duty;
+
+	if (period == channel->state.period &&
+	    duty == channel->state.duty_cycle)
+		return 0;
+
+	fin_freq = clk_get_rate(channel->clk);
+	if (fin_freq == 0) {
+		dev_err(meson->chip.dev, "invalid source clock frequency\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(meson->chip.dev, "fin_freq: %lu Hz\n", fin_freq);
+	fin_ns = NSEC_PER_SEC / fin_freq;
+
+	/* Calc pre_div with the period */
+	for (pre_div = 0; pre_div < MISC_CLK_DIV_MASK; pre_div++) {
+		cnt = DIV_ROUND_CLOSEST(period, fin_ns * (pre_div + 1));
+		dev_dbg(meson->chip.dev, "fin_ns=%lu pre_div=%u cnt=%u\n",
+			fin_ns, pre_div, cnt);
+		if (cnt <= 0xffff)
+			break;
+	}
+
+	if (pre_div == MISC_CLK_DIV_MASK) {
+		dev_err(meson->chip.dev, "unable to get period pre_div\n");
+		return -EINVAL;
+	}
+
+	dev_dbg(meson->chip.dev, "period=%u pre_div=%u cnt=%u\n", period,
+		pre_div, cnt);
+
+	if (duty == period) {
+		channel->pre_div = pre_div;
+		channel->hi = cnt;
+		channel->lo = 0;
+	} else if (duty == 0) {
+		channel->pre_div = pre_div;
+		channel->hi = 0;
+		channel->lo = cnt;
+	} else {
+		/* Then check is we can have the duty with the same pre_div */
+		duty_cnt = DIV_ROUND_CLOSEST(duty, fin_ns * (pre_div + 1));
+		if (duty_cnt > 0xffff) {
+			dev_err(meson->chip.dev, "unable to get duty cycle\n");
+			return -EINVAL;
+		}
+
+		dev_dbg(meson->chip.dev, "duty=%u pre_div=%u duty_cnt=%u\n",
+			duty, pre_div, duty_cnt);
+
+		channel->pre_div = pre_div;
+		channel->hi = duty_cnt;
+		channel->lo = cnt - duty_cnt;
+	}
+
+	return 0;
+}
+
+static void meson_pwm_enable(struct meson_pwm *meson,
+			     struct meson_pwm_channel *channel,
+			     unsigned int id)
+{
+	u32 value, clk_shift, clk_enable, enable;
+	unsigned int offset;
+
+	switch (id) {
+	case 0:
+		clk_shift = MISC_A_CLK_DIV_SHIFT;
+		clk_enable = MISC_A_CLK_EN;
+		enable = MISC_A_EN;
+		offset = REG_PWM_A;
+		break;
+
+	case 1:
+		clk_shift = MISC_B_CLK_DIV_SHIFT;
+		clk_enable = MISC_B_CLK_EN;
+		enable = MISC_B_EN;
+		offset = REG_PWM_B;
+		break;
+
+	default:
+		return;
+	}
+
+	value = readl(meson->base + REG_MISC_AB);
+	value &= ~(MISC_CLK_DIV_MASK << clk_shift);
+	value |= channel->pre_div << clk_shift;
+	value |= clk_enable;
+	writel(value, meson->base + REG_MISC_AB);
+
+	value = (channel->hi << PWM_HIGH_SHIFT) | channel->lo;
+	writel(value, meson->base + offset);
+
+	value = readl(meson->base + REG_MISC_AB);
+	value |= enable;
+	writel(value, meson->base + REG_MISC_AB);
+}
+
+static void meson_pwm_disable(struct meson_pwm *meson, unsigned int id)
+{
+	u32 value, enable;
+
+	switch (id) {
+	case 0:
+		enable = MISC_A_EN;
+		break;
+
+	case 1:
+		enable = MISC_B_EN;
+		break;
+
+	default:
+		return;
+	}
+
+	value = readl(meson->base + REG_MISC_AB);
+	value &= ~enable;
+	writel(value, meson->base + REG_MISC_AB);
+}
+
+static int meson_pwm_apply(struct pwm_chip *chip, struct pwm_device *pwm,
+			   struct pwm_state *state)
+{
+	struct meson_pwm_channel *channel = pwm_get_chip_data(pwm);
+	struct meson_pwm *meson = to_meson_pwm(chip);
+	unsigned long flags;
+	int err = 0;
+
+	if (!state)
+		return -EINVAL;
+
+	spin_lock_irqsave(&meson->lock, flags);
+
+	if (!state->enabled) {
+		meson_pwm_disable(meson, pwm->hwpwm);
+		channel->state.enabled = false;
+
+		goto unlock;
+	}
+
+	if (state->period != channel->state.period ||
+	    state->duty_cycle != channel->state.duty_cycle ||
+	    state->polarity != channel->state.polarity) {
+		if (channel->state.enabled) {
+			meson_pwm_disable(meson, pwm->hwpwm);
+			channel->state.enabled = false;
+		}
+
+		if (state->polarity != channel->state.polarity) {
+			if (state->polarity == PWM_POLARITY_NORMAL)
+				meson->inverter_mask |= BIT(pwm->hwpwm);
+			else
+				meson->inverter_mask &= ~BIT(pwm->hwpwm);
+		}
+
+		err = meson_pwm_calc(meson, channel, pwm->hwpwm,
+				     state->duty_cycle, state->period);
+		if (err < 0)
+			goto unlock;
+
+		channel->state.polarity = state->polarity;
+		channel->state.period = state->period;
+		channel->state.duty_cycle = state->duty_cycle;
+	}
+
+	if (state->enabled && !channel->state.enabled) {
+		meson_pwm_enable(meson, channel, pwm->hwpwm);
+		channel->state.enabled = true;
+	}
+
+unlock:
+	spin_unlock_irqrestore(&meson->lock, flags);
+	return err;
+}
+
+static void meson_pwm_get_state(struct pwm_chip *chip, struct pwm_device *pwm,
+				struct pwm_state *state)
+{
+	struct meson_pwm *meson = to_meson_pwm(chip);
+	u32 value, mask;
+
+	if (!state)
+		return;
+
+	switch (pwm->hwpwm) {
+	case 0:
+		mask = MISC_A_EN;
+		break;
+
+	case 1:
+		mask = MISC_B_EN;
+		break;
+
+	default:
+		return;
+	}
+
+	value = readl(meson->base + REG_MISC_AB);
+	state->enabled = (value & mask) != 0;
+}
+
+static const struct pwm_ops meson_pwm_ops = {
+	.request = meson_pwm_request,
+	.free = meson_pwm_free,
+	.apply = meson_pwm_apply,
+	.get_state = meson_pwm_get_state,
+	.owner = THIS_MODULE,
+};
+
+static const char * const pwm_meson8b_parent_names[] = {
+	"xtal", "vid_pll", "fclk_div4", "fclk_div3"
+};
+
+static const struct meson_pwm_data pwm_meson8b_data = {
+	.parent_names = pwm_meson8b_parent_names,
+};
+
+static const char * const pwm_gxbb_parent_names[] = {
+	"xtal", "hdmi_pll", "fclk_div4", "fclk_div3"
+};
+
+static const struct meson_pwm_data pwm_gxbb_data = {
+	.parent_names = pwm_gxbb_parent_names,
+};
+
+static const struct of_device_id meson_pwm_matches[] = {
+	{ .compatible = "amlogic,meson8b-pwm", .data = &pwm_meson8b_data },
+	{ .compatible = "amlogic,meson-gxbb-pwm", .data = &pwm_gxbb_data },
+	{},
+};
+MODULE_DEVICE_TABLE(of, meson_pwm_matches);
+
+static int meson_pwm_init_channels(struct meson_pwm *meson,
+				   struct meson_pwm_channel *channels)
+{
+	struct device *dev = meson->chip.dev;
+	struct device_node *np = dev->of_node;
+	struct clk_init_data init;
+	unsigned int i;
+	char name[255];
+	int err;
+
+	for (i = 0; i < meson->chip.npwm; i++) {
+		struct meson_pwm_channel *channel = &channels[i];
+
+		snprintf(name, sizeof(name), "%s#mux%u", np->full_name, i);
+
+		init.name = name;
+		init.ops = &clk_mux_ops;
+		init.flags = CLK_IS_BASIC;
+		init.parent_names = meson->data->parent_names;
+		init.num_parents = 1 << MISC_CLK_SEL_WIDTH;
+
+		channel->mux.reg = meson->base + REG_MISC_AB;
+		channel->mux.shift = mux_reg_shifts[i];
+		channel->mux.mask = BIT(MISC_CLK_SEL_WIDTH) - 1;
+		channel->mux.flags = 0;
+		channel->mux.lock = &meson->lock;
+		channel->mux.table = NULL;
+		channel->mux.hw.init = &init;
+
+		channel->clk = devm_clk_register(dev, &channel->mux.hw);
+		if (IS_ERR(channel->clk)) {
+			err = PTR_ERR(channel->clk);
+			dev_err(dev, "failed to register %s: %d\n", name, err);
+			return err;
+		}
+
+		snprintf(name, sizeof(name), "clkin%u", i);
+
+		channel->clk_parent = devm_clk_get(dev, name);
+		if (IS_ERR(channel->clk_parent)) {
+			err = PTR_ERR(channel->clk_parent);
+			if (err == -EPROBE_DEFER)
+				return err;
+
+			channel->clk_parent = NULL;
+		}
+	}
+
+	return 0;
+}
+
+static void meson_pwm_add_channels(struct meson_pwm *meson,
+				   struct meson_pwm_channel *channels)
+{
+	unsigned int i;
+
+	for (i = 0; i < meson->chip.npwm; i++)
+		pwm_set_chip_data(&meson->chip.pwms[i], &channels[i]);
+}
+
+static int meson_pwm_probe(struct platform_device *pdev)
+{
+	struct meson_pwm_channel *channels;
+	struct meson_pwm *meson;
+	struct resource *regs;
+	int err;
+
+	meson = devm_kzalloc(&pdev->dev, sizeof(*meson), GFP_KERNEL);
+	if (!meson)
+		return -ENOMEM;
+
+	regs = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	meson->base = devm_ioremap_resource(&pdev->dev, regs);
+	if (IS_ERR(meson->base))
+		return PTR_ERR(meson->base);
+
+	meson->chip.dev = &pdev->dev;
+	meson->chip.ops = &meson_pwm_ops;
+	meson->chip.base = -1;
+	meson->chip.npwm = 2;
+	meson->chip.of_xlate = of_pwm_xlate_with_flags;
+	meson->chip.of_pwm_n_cells = 3;
+
+	meson->data = of_device_get_match_data(&pdev->dev);
+	meson->inverter_mask = BIT(meson->chip.npwm) - 1;
+
+	channels = devm_kcalloc(&pdev->dev, meson->chip.npwm, sizeof(*meson),
+				GFP_KERNEL);
+	if (!channels)
+		return -ENOMEM;
+
+	err = meson_pwm_init_channels(meson, channels);
+	if (err < 0)
+		return err;
+
+	err = pwmchip_add(&meson->chip);
+	if (err < 0) {
+		dev_err(&pdev->dev, "failed to register PWM chip: %d\n", err);
+		return err;
+	}
+
+	meson_pwm_add_channels(meson, channels);
+
+	platform_set_drvdata(pdev, meson);
+
+	return 0;
+}
+
+static int meson_pwm_remove(struct platform_device *pdev)
+{
+	struct meson_pwm *meson = platform_get_drvdata(pdev);
+
+	return pwmchip_remove(&meson->chip);
+}
+
+static struct platform_driver meson_pwm_driver = {
+	.driver = {
+		.name = "meson-pwm",
+		.of_match_table = meson_pwm_matches,
+	},
+	.probe = meson_pwm_probe,
+	.remove = meson_pwm_remove,
+};
+module_platform_driver(meson_pwm_driver);
+
+MODULE_ALIAS("platform:meson-pwm");
+MODULE_DESCRIPTION("Amlogic Meson PWM Generator driver");
+MODULE_AUTHOR("Neil Armstrong <narmstrong@baylibre.com>");
+MODULE_LICENSE("Dual BSD/GPL");
diff --git a/drivers/pwm/pwm-mtk-disp.c b/drivers/pwm/pwm-mtk-disp.c
index 0ad3385298c0..893940d45f0d 100644
--- a/drivers/pwm/pwm-mtk-disp.c
+++ b/drivers/pwm/pwm-mtk-disp.c
@@ -18,30 +18,40 @@
 #include <linux/io.h>
 #include <linux/module.h>
 #include <linux/of.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/pwm.h>
 #include <linux/slab.h>
 
 #define DISP_PWM_EN		0x00
-#define PWM_ENABLE_MASK		BIT(0)
 
-#define DISP_PWM_COMMIT		0x08
-#define PWM_COMMIT_MASK		BIT(0)
-
-#define DISP_PWM_CON_0		0x10
 #define PWM_CLKDIV_SHIFT	16
 #define PWM_CLKDIV_MAX		0x3ff
 #define PWM_CLKDIV_MASK		(PWM_CLKDIV_MAX << PWM_CLKDIV_SHIFT)
 
-#define DISP_PWM_CON_1		0x14
 #define PWM_PERIOD_BIT_WIDTH	12
 #define PWM_PERIOD_MASK		((1 << PWM_PERIOD_BIT_WIDTH) - 1)
 
 #define PWM_HIGH_WIDTH_SHIFT	16
 #define PWM_HIGH_WIDTH_MASK	(0x1fff << PWM_HIGH_WIDTH_SHIFT)
 
+struct mtk_pwm_data {
+	u32 enable_mask;
+	unsigned int con0;
+	u32 con0_sel;
+	unsigned int con1;
+
+	bool has_commit;
+	unsigned int commit;
+	unsigned int commit_mask;
+
+	unsigned int bls_debug;
+	u32 bls_debug_mask;
+};
+
 struct mtk_disp_pwm {
 	struct pwm_chip chip;
+	const struct mtk_pwm_data *data;
 	struct clk *clk_main;
 	struct clk *clk_mm;
 	void __iomem *base;
@@ -106,12 +116,21 @@ static int mtk_disp_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 		return err;
 	}
 
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_CON_0, PWM_CLKDIV_MASK,
+	mtk_disp_pwm_update_bits(mdp, mdp->data->con0,
+				 PWM_CLKDIV_MASK,
 				 clk_div << PWM_CLKDIV_SHIFT);
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_CON_1,
-				 PWM_PERIOD_MASK | PWM_HIGH_WIDTH_MASK, value);
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_COMMIT, PWM_COMMIT_MASK, 1);
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_COMMIT, PWM_COMMIT_MASK, 0);
+	mtk_disp_pwm_update_bits(mdp, mdp->data->con1,
+				 PWM_PERIOD_MASK | PWM_HIGH_WIDTH_MASK,
+				 value);
+
+	if (mdp->data->has_commit) {
+		mtk_disp_pwm_update_bits(mdp, mdp->data->commit,
+					 mdp->data->commit_mask,
+					 mdp->data->commit_mask);
+		mtk_disp_pwm_update_bits(mdp, mdp->data->commit,
+					 mdp->data->commit_mask,
+					 0x0);
+	}
 
 	clk_disable(mdp->clk_mm);
 	clk_disable(mdp->clk_main);
@@ -134,7 +153,8 @@ static int mtk_disp_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
 		return err;
 	}
 
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, PWM_ENABLE_MASK, 1);
+	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
+				 mdp->data->enable_mask);
 
 	return 0;
 }
@@ -143,7 +163,8 @@ static void mtk_disp_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
 {
 	struct mtk_disp_pwm *mdp = to_mtk_disp_pwm(chip);
 
-	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, PWM_ENABLE_MASK, 0);
+	mtk_disp_pwm_update_bits(mdp, DISP_PWM_EN, mdp->data->enable_mask,
+				 0x0);
 
 	clk_disable(mdp->clk_mm);
 	clk_disable(mdp->clk_main);
@@ -166,6 +187,8 @@ static int mtk_disp_pwm_probe(struct platform_device *pdev)
 	if (!mdp)
 		return -ENOMEM;
 
+	mdp->data = of_device_get_match_data(&pdev->dev);
+
 	r = platform_get_resource(pdev, IORESOURCE_MEM, 0);
 	mdp->base = devm_ioremap_resource(&pdev->dev, r);
 	if (IS_ERR(mdp->base))
@@ -200,6 +223,19 @@ static int mtk_disp_pwm_probe(struct platform_device *pdev)
 
 	platform_set_drvdata(pdev, mdp);
 
+	/*
+	 * For MT2701, disable double buffer before writing register
+	 * and select manual mode and use PWM_PERIOD/PWM_HIGH_WIDTH.
+	 */
+	if (!mdp->data->has_commit) {
+		mtk_disp_pwm_update_bits(mdp, mdp->data->bls_debug,
+					 mdp->data->bls_debug_mask,
+					 mdp->data->bls_debug_mask);
+		mtk_disp_pwm_update_bits(mdp, mdp->data->con0,
+					 mdp->data->con0_sel,
+					 mdp->data->con0_sel);
+	}
+
 	return 0;
 
 disable_clk_mm:
@@ -221,9 +257,30 @@ static int mtk_disp_pwm_remove(struct platform_device *pdev)
 	return ret;
 }
 
+static const struct mtk_pwm_data mt2701_pwm_data = {
+	.enable_mask = BIT(16),
+	.con0 = 0xa8,
+	.con0_sel = 0x2,
+	.con1 = 0xac,
+	.has_commit = false,
+	.bls_debug = 0xb0,
+	.bls_debug_mask = 0x3,
+};
+
+static const struct mtk_pwm_data mt8173_pwm_data = {
+	.enable_mask = BIT(0),
+	.con0 = 0x10,
+	.con0_sel = 0x0,
+	.con1 = 0x14,
+	.has_commit = true,
+	.commit = 0x8,
+	.commit_mask = 0x1,
+};
+
 static const struct of_device_id mtk_disp_pwm_of_match[] = {
-	{ .compatible = "mediatek,mt8173-disp-pwm" },
-	{ .compatible = "mediatek,mt6595-disp-pwm" },
+	{ .compatible = "mediatek,mt2701-disp-pwm", .data = &mt2701_pwm_data},
+	{ .compatible = "mediatek,mt6595-disp-pwm", .data = &mt8173_pwm_data},
+	{ .compatible = "mediatek,mt8173-disp-pwm", .data = &mt8173_pwm_data},
 	{ }
 };
 MODULE_DEVICE_TABLE(of, mtk_disp_pwm_of_match);
diff --git a/drivers/pwm/pwm-samsung.c b/drivers/pwm/pwm-samsung.c
index ada2d326dc3e..f113cda47032 100644
--- a/drivers/pwm/pwm-samsung.c
+++ b/drivers/pwm/pwm-samsung.c
@@ -193,9 +193,18 @@ static unsigned long pwm_samsung_calc_tin(struct samsung_pwm_chip *chip,
 	 * divider settings and choose the lowest divisor that can generate
 	 * frequencies lower than requested.
 	 */
-	for (div = variant->div_base; div < 4; ++div)
-		if ((rate >> (variant->bits + div)) < freq)
-			break;
+	if (variant->bits < 32) {
+		/* Only for s3c24xx */
+		for (div = variant->div_base; div < 4; ++div)
+			if ((rate >> (variant->bits + div)) < freq)
+				break;
+	} else {
+		/*
+		 * Other variants have enough counter bits to generate any
+		 * requested rate, so no need to check higher divisors.
+		 */
+		div = variant->div_base;
+	}
 
 	pwm_samsung_set_divisor(chip, chan, BIT(div));
 
diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c
index 92abbd56b9f7..dd82dc840af9 100644
--- a/drivers/pwm/pwm-sti.c
+++ b/drivers/pwm/pwm-sti.c
@@ -1,8 +1,10 @@
 /*
- * PWM device driver for ST SoCs.
- * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ * PWM device driver for ST SoCs
+ *
+ * Copyright (C) 2013-2016 STMicroelectronics (R&D) Limited
  *
- * Copyright (C) 2013-2014 STMicroelectronics (R&D) Limited
+ * Author: Ajit Pal Singh <ajitpal.singh@st.com>
+ *         Lee Jones <lee.jones@linaro.org>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License as published by
@@ -11,6 +13,7 @@
  */
 
 #include <linux/clk.h>
+#include <linux/interrupt.h>
 #include <linux/math64.h>
 #include <linux/mfd/syscon.h>
 #include <linux/module.h>
@@ -18,43 +21,82 @@
 #include <linux/platform_device.h>
 #include <linux/pwm.h>
 #include <linux/regmap.h>
+#include <linux/sched.h>
 #include <linux/slab.h>
 #include <linux/time.h>
+#include <linux/wait.h>
+
+#define PWM_OUT_VAL(x)	(0x00 + (4 * (x))) /* Device's Duty Cycle register */
+#define PWM_CPT_VAL(x)	(0x10 + (4 * (x))) /* Capture value */
+#define PWM_CPT_EDGE(x) (0x30 + (4 * (x))) /* Edge to capture on */
 
-#define STI_DS_REG(ch)	(4 * (ch))	/* Channel's Duty Cycle register */
-#define STI_PWMCR	0x50		/* Control/Config register */
-#define STI_INTEN	0x54		/* Interrupt Enable/Disable register */
-#define PWM_PRESCALE_LOW_MASK		0x0f
-#define PWM_PRESCALE_HIGH_MASK		0xf0
+#define STI_PWM_CTRL		0x50	/* Control/Config register */
+#define STI_INT_EN		0x54	/* Interrupt Enable/Disable register */
+#define STI_INT_STA		0x58	/* Interrupt Status register */
+#define PWM_INT_ACK		0x5c
+#define PWM_PRESCALE_LOW_MASK	0x0f
+#define PWM_PRESCALE_HIGH_MASK	0xf0
+#define PWM_CPT_EDGE_MASK	0x03
+#define PWM_INT_ACK_MASK	0x1ff
+
+#define STI_MAX_CPT_DEVS	4
+#define CPT_DC_MAX		0xff
 
 /* Regfield IDs */
 enum {
+	/* Bits in PWM_CTRL*/
 	PWMCLK_PRESCALE_LOW,
 	PWMCLK_PRESCALE_HIGH,
-	PWM_EN,
-	PWM_INT_EN,
+	CPTCLK_PRESCALE,
+
+	PWM_OUT_EN,
+	PWM_CPT_EN,
+
+	PWM_CPT_INT_EN,
+	PWM_CPT_INT_STAT,
 
 	/* Keep last */
 	MAX_REGFIELDS
 };
 
+/*
+ * Each capture input can be programmed to detect rising-edge, falling-edge,
+ * either edge or neither egde.
+ */
+enum sti_cpt_edge {
+	CPT_EDGE_DISABLED,
+	CPT_EDGE_RISING,
+	CPT_EDGE_FALLING,
+	CPT_EDGE_BOTH,
+};
+
+struct sti_cpt_ddata {
+	u32 snapshot[3];
+	unsigned int index;
+	struct mutex lock;
+	wait_queue_head_t wait;
+};
+
 struct sti_pwm_compat_data {
 	const struct reg_field *reg_fields;
-	unsigned int num_chan;
+	unsigned int pwm_num_devs;
+	unsigned int cpt_num_devs;
 	unsigned int max_pwm_cnt;
 	unsigned int max_prescale;
 };
 
 struct sti_pwm_chip {
 	struct device *dev;
-	struct clk *clk;
-	unsigned long clk_rate;
+	struct clk *pwm_clk;
+	struct clk *cpt_clk;
 	struct regmap *regmap;
 	struct sti_pwm_compat_data *cdata;
 	struct regmap_field *prescale_low;
 	struct regmap_field *prescale_high;
-	struct regmap_field *pwm_en;
-	struct regmap_field *pwm_int_en;
+	struct regmap_field *pwm_out_en;
+	struct regmap_field *pwm_cpt_en;
+	struct regmap_field *pwm_cpt_int_en;
+	struct regmap_field *pwm_cpt_int_stat;
 	struct pwm_chip chip;
 	struct pwm_device *cur;
 	unsigned long configured;
@@ -64,10 +106,13 @@ struct sti_pwm_chip {
 };
 
 static const struct reg_field sti_pwm_regfields[MAX_REGFIELDS] = {
-	[PWMCLK_PRESCALE_LOW]	= REG_FIELD(STI_PWMCR, 0, 3),
-	[PWMCLK_PRESCALE_HIGH]	= REG_FIELD(STI_PWMCR, 11, 14),
-	[PWM_EN]		= REG_FIELD(STI_PWMCR, 9, 9),
-	[PWM_INT_EN]		= REG_FIELD(STI_INTEN, 0, 0),
+	[PWMCLK_PRESCALE_LOW] = REG_FIELD(STI_PWM_CTRL, 0, 3),
+	[PWMCLK_PRESCALE_HIGH] = REG_FIELD(STI_PWM_CTRL, 11, 14),
+	[CPTCLK_PRESCALE] = REG_FIELD(STI_PWM_CTRL, 4, 8),
+	[PWM_OUT_EN] = REG_FIELD(STI_PWM_CTRL, 9, 9),
+	[PWM_CPT_EN] = REG_FIELD(STI_PWM_CTRL, 10, 10),
+	[PWM_CPT_INT_EN] = REG_FIELD(STI_INT_EN, 1, 4),
+	[PWM_CPT_INT_STAT] = REG_FIELD(STI_INT_STA, 1, 4),
 };
 
 static inline struct sti_pwm_chip *to_sti_pwmchip(struct pwm_chip *chip)
@@ -82,61 +127,68 @@ static int sti_pwm_get_prescale(struct sti_pwm_chip *pc, unsigned long period,
 				unsigned int *prescale)
 {
 	struct sti_pwm_compat_data *cdata = pc->cdata;
-	unsigned long val;
+	unsigned long clk_rate;
+	unsigned long value;
 	unsigned int ps;
 
+	clk_rate = clk_get_rate(pc->pwm_clk);
+	if (!clk_rate) {
+		dev_err(pc->dev, "failed to get clock rate\n");
+		return -EINVAL;
+	}
+
 	/*
-	 * prescale = ((period_ns * clk_rate) / (10^9 * (max_pwm_count + 1)) - 1
+	 * prescale = ((period_ns * clk_rate) / (10^9 * (max_pwm_cnt + 1)) - 1
 	 */
-	val = NSEC_PER_SEC / pc->clk_rate;
-	val *= cdata->max_pwm_cnt + 1;
+	value = NSEC_PER_SEC / clk_rate;
+	value *= cdata->max_pwm_cnt + 1;
 
-	if (period % val) {
+	if (period % value)
 		return -EINVAL;
-	} else {
-		ps  = period / val - 1;
-		if (ps > cdata->max_prescale)
-			return -EINVAL;
-	}
+
+	ps  = period / value - 1;
+	if (ps > cdata->max_prescale)
+		return -EINVAL;
+
 	*prescale = ps;
 
 	return 0;
 }
 
 /*
- * For STiH4xx PWM IP, the PWM period is fixed to 256 local clock cycles.
- * The only way to change the period (apart from changing the PWM input clock)
- * is to change the PWM clock prescaler.
- * The prescaler is of 8 bits, so 256 prescaler values and hence
- * 256 possible period values are supported (for a particular clock rate).
- * The requested period will be applied only if it matches one of these
- * 256 values.
+ * For STiH4xx PWM IP, the PWM period is fixed to 256 local clock cycles. The
+ * only way to change the period (apart from changing the PWM input clock) is
+ * to change the PWM clock prescaler.
+ *
+ * The prescaler is of 8 bits, so 256 prescaler values and hence 256 possible
+ * period values are supported (for a particular clock rate). The requested
+ * period will be applied only if it matches one of these 256 values.
  */
 static int sti_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
-			 int duty_ns, int period_ns)
+			  int duty_ns, int period_ns)
 {
 	struct sti_pwm_chip *pc = to_sti_pwmchip(chip);
 	struct sti_pwm_compat_data *cdata = pc->cdata;
+	unsigned int ncfg, value, prescale = 0;
 	struct pwm_device *cur = pc->cur;
 	struct device *dev = pc->dev;
-	unsigned int prescale = 0, pwmvalx;
-	int ret;
-	unsigned int ncfg;
 	bool period_same = false;
+	int ret;
 
 	ncfg = hweight_long(pc->configured);
 	if (ncfg)
 		period_same = (period_ns == pwm_get_period(cur));
 
-	/* Allow configuration changes if one of the
-	 * following conditions satisfy.
-	 * 1. No channels have been configured.
-	 * 2. Only one channel has been configured and the new request
-	 *    is for the same channel.
-	 * 3. Only one channel has been configured and the new request is
-	 *    for a new channel and period of the new channel is same as
-	 *    the current configured period.
-	 * 4. More than one channels are configured and period of the new
+	/*
+	 * Allow configuration changes if one of the following conditions
+	 * satisfy.
+	 * 1. No devices have been configured.
+	 * 2. Only one device has been configured and the new request is for
+	 *    the same device.
+	 * 3. Only one device has been configured and the new request is for
+	 *    a new device and period of the new device is same as the current
+	 *    configured period.
+	 * 4. More than one devices are configured and period of the new
 	 *    requestis the same as the current period.
 	 */
 	if (!ncfg ||
@@ -144,7 +196,11 @@ static int sti_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 	    ((ncfg == 1) && (pwm->hwpwm != cur->hwpwm) && period_same) ||
 	    ((ncfg > 1) && period_same)) {
 		/* Enable clock before writing to PWM registers. */
-		ret = clk_enable(pc->clk);
+		ret = clk_enable(pc->pwm_clk);
+		if (ret)
+			return ret;
+
+		ret = clk_enable(pc->cpt_clk);
 		if (ret)
 			return ret;
 
@@ -153,15 +209,15 @@ static int sti_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 			if (ret)
 				goto clk_dis;
 
-			ret =
-			regmap_field_write(pc->prescale_low,
-					   prescale & PWM_PRESCALE_LOW_MASK);
+			value = prescale & PWM_PRESCALE_LOW_MASK;
+
+			ret = regmap_field_write(pc->prescale_low, value);
 			if (ret)
 				goto clk_dis;
 
-			ret =
-			regmap_field_write(pc->prescale_high,
-				(prescale & PWM_PRESCALE_HIGH_MASK) >> 4);
+			value = (prescale & PWM_PRESCALE_HIGH_MASK) >> 4;
+
+			ret = regmap_field_write(pc->prescale_high, value);
 			if (ret)
 				goto clk_dis;
 		}
@@ -172,25 +228,26 @@ static int sti_pwm_config(struct pwm_chip *chip, struct pwm_device *pwm,
 		 * PWM pulse = (max_pwm_count + 1) local cycles,
 		 * that is continuous pulse: signal never goes low.
 		 */
-		pwmvalx = cdata->max_pwm_cnt * duty_ns / period_ns;
+		value = cdata->max_pwm_cnt * duty_ns / period_ns;
 
-		ret = regmap_write(pc->regmap, STI_DS_REG(pwm->hwpwm), pwmvalx);
+		ret = regmap_write(pc->regmap, PWM_OUT_VAL(pwm->hwpwm), value);
 		if (ret)
 			goto clk_dis;
 
-		ret = regmap_field_write(pc->pwm_int_en, 0);
+		ret = regmap_field_write(pc->pwm_cpt_int_en, 0);
 
 		set_bit(pwm->hwpwm, &pc->configured);
 		pc->cur = pwm;
 
-		dev_dbg(dev, "prescale:%u, period:%i, duty:%i, pwmvalx:%u\n",
-			prescale, period_ns, duty_ns, pwmvalx);
+		dev_dbg(dev, "prescale:%u, period:%i, duty:%i, value:%u\n",
+			prescale, period_ns, duty_ns, value);
 	} else {
 		return -EINVAL;
 	}
 
 clk_dis:
-	clk_disable(pc->clk);
+	clk_disable(pc->pwm_clk);
+	clk_disable(pc->cpt_clk);
 	return ret;
 }
 
@@ -201,23 +258,30 @@ static int sti_pwm_enable(struct pwm_chip *chip, struct pwm_device *pwm)
 	int ret = 0;
 
 	/*
-	 * Since we have a common enable for all PWM channels,
-	 * do not enable if already enabled.
+	 * Since we have a common enable for all PWM devices, do not enable if
+	 * already enabled.
 	 */
 	mutex_lock(&pc->sti_pwm_lock);
+
 	if (!pc->en_count) {
-		ret = clk_enable(pc->clk);
+		ret = clk_enable(pc->pwm_clk);
+		if (ret)
+			goto out;
+
+		ret = clk_enable(pc->cpt_clk);
 		if (ret)
 			goto out;
 
-		ret = regmap_field_write(pc->pwm_en, 1);
+		ret = regmap_field_write(pc->pwm_out_en, 1);
 		if (ret) {
-			dev_err(dev, "failed to enable PWM device:%d\n",
-				pwm->hwpwm);
+			dev_err(dev, "failed to enable PWM device %u: %d\n",
+				pwm->hwpwm, ret);
 			goto out;
 		}
 	}
+
 	pc->en_count++;
+
 out:
 	mutex_unlock(&pc->sti_pwm_lock);
 	return ret;
@@ -228,13 +292,17 @@ static void sti_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
 	struct sti_pwm_chip *pc = to_sti_pwmchip(chip);
 
 	mutex_lock(&pc->sti_pwm_lock);
+
 	if (--pc->en_count) {
 		mutex_unlock(&pc->sti_pwm_lock);
 		return;
 	}
-	regmap_field_write(pc->pwm_en, 0);
 
-	clk_disable(pc->clk);
+	regmap_field_write(pc->pwm_out_en, 0);
+
+	clk_disable(pc->pwm_clk);
+	clk_disable(pc->cpt_clk);
+
 	mutex_unlock(&pc->sti_pwm_lock);
 }
 
@@ -245,7 +313,90 @@ static void sti_pwm_free(struct pwm_chip *chip, struct pwm_device *pwm)
 	clear_bit(pwm->hwpwm, &pc->configured);
 }
 
+static int sti_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm,
+			   struct pwm_capture *result, unsigned long timeout)
+{
+	struct sti_pwm_chip *pc = to_sti_pwmchip(chip);
+	struct sti_pwm_compat_data *cdata = pc->cdata;
+	struct sti_cpt_ddata *ddata = pwm_get_chip_data(pwm);
+	struct device *dev = pc->dev;
+	unsigned int effective_ticks;
+	unsigned long long high, low;
+	int ret;
+
+	if (pwm->hwpwm >= cdata->cpt_num_devs) {
+		dev_err(dev, "device %u is not valid\n", pwm->hwpwm);
+		return -EINVAL;
+	}
+
+	mutex_lock(&ddata->lock);
+	ddata->index = 0;
+
+	/* Prepare capture measurement */
+	regmap_write(pc->regmap, PWM_CPT_EDGE(pwm->hwpwm), CPT_EDGE_RISING);
+	regmap_field_write(pc->pwm_cpt_int_en, BIT(pwm->hwpwm));
+
+	/* Enable capture */
+	ret = regmap_field_write(pc->pwm_cpt_en, 1);
+	if (ret) {
+		dev_err(dev, "failed to enable PWM capture %u: %d\n",
+			pwm->hwpwm, ret);
+		goto out;
+	}
+
+	ret = wait_event_interruptible_timeout(ddata->wait, ddata->index > 1,
+					       msecs_to_jiffies(timeout));
+
+	regmap_write(pc->regmap, PWM_CPT_EDGE(pwm->hwpwm), CPT_EDGE_DISABLED);
+
+	if (ret == -ERESTARTSYS)
+		goto out;
+
+	switch (ddata->index) {
+	case 0:
+	case 1:
+		/*
+		 * Getting here could mean:
+		 *  - input signal is constant of less than 1 Hz
+		 *  - there is no input signal at all
+		 *
+		 * In such case the frequency is rounded down to 0
+		 */
+		result->period = 0;
+		result->duty_cycle = 0;
+
+		break;
+
+	case 2:
+		/* We have everying we need */
+		high = ddata->snapshot[1] - ddata->snapshot[0];
+		low = ddata->snapshot[2] - ddata->snapshot[1];
+
+		effective_ticks = clk_get_rate(pc->cpt_clk);
+
+		result->period = (high + low) * NSEC_PER_SEC;
+		result->period /= effective_ticks;
+
+		result->duty_cycle = high * NSEC_PER_SEC;
+		result->duty_cycle /= effective_ticks;
+
+		break;
+
+	default:
+		dev_err(dev, "internal error\n");
+		break;
+	}
+
+out:
+	/* Disable capture */
+	regmap_field_write(pc->pwm_cpt_en, 0);
+
+	mutex_unlock(&ddata->lock);
+	return ret;
+}
+
 static const struct pwm_ops sti_pwm_ops = {
+	.capture = sti_pwm_capture,
 	.config = sti_pwm_config,
 	.enable = sti_pwm_enable,
 	.disable = sti_pwm_disable,
@@ -253,17 +404,98 @@ static const struct pwm_ops sti_pwm_ops = {
 	.owner = THIS_MODULE,
 };
 
+static irqreturn_t sti_pwm_interrupt(int irq, void *data)
+{
+	struct sti_pwm_chip *pc = data;
+	struct device *dev = pc->dev;
+	struct sti_cpt_ddata *ddata;
+	int devicenum;
+	unsigned int cpt_int_stat;
+	unsigned int reg;
+	int ret = IRQ_NONE;
+
+	ret = regmap_field_read(pc->pwm_cpt_int_stat, &cpt_int_stat);
+	if (ret)
+		return ret;
+
+	while (cpt_int_stat) {
+		devicenum = ffs(cpt_int_stat) - 1;
+
+		ddata = pwm_get_chip_data(&pc->chip.pwms[devicenum]);
+
+		/*
+		 * Capture input:
+		 *    _______                   _______
+		 *   |       |                 |       |
+		 * __|       |_________________|       |________
+		 *   ^0      ^1                ^2
+		 *
+		 * Capture start by the first available rising edge. When a
+		 * capture event occurs, capture value (CPT_VALx) is stored,
+		 * index incremented, capture edge changed.
+		 *
+		 * After the capture, if the index > 1, we have collected the
+		 * necessary data so we signal the thread waiting for it and
+		 * disable the capture by setting capture edge to none
+		 */
+
+		regmap_read(pc->regmap,
+			    PWM_CPT_VAL(devicenum),
+			    &ddata->snapshot[ddata->index]);
+
+		switch (ddata->index) {
+		case 0:
+		case 1:
+			regmap_read(pc->regmap, PWM_CPT_EDGE(devicenum), &reg);
+			reg ^= PWM_CPT_EDGE_MASK;
+			regmap_write(pc->regmap, PWM_CPT_EDGE(devicenum), reg);
+
+			ddata->index++;
+			break;
+
+		case 2:
+			regmap_write(pc->regmap,
+				     PWM_CPT_EDGE(devicenum),
+				     CPT_EDGE_DISABLED);
+			wake_up(&ddata->wait);
+			break;
+
+		default:
+			dev_err(dev, "Internal error\n");
+		}
+
+		cpt_int_stat &= ~BIT_MASK(devicenum);
+
+		ret = IRQ_HANDLED;
+	}
+
+	/* Just ACK everything */
+	regmap_write(pc->regmap, PWM_INT_ACK, PWM_INT_ACK_MASK);
+
+	return ret;
+}
+
 static int sti_pwm_probe_dt(struct sti_pwm_chip *pc)
 {
 	struct device *dev = pc->dev;
 	const struct reg_field *reg_fields;
 	struct device_node *np = dev->of_node;
 	struct sti_pwm_compat_data *cdata = pc->cdata;
-	u32 num_chan;
+	u32 num_devs;
+	int ret;
+
+	ret = of_property_read_u32(np, "st,pwm-num-chan", &num_devs);
+	if (!ret)
+		cdata->pwm_num_devs = num_devs;
+
+	ret = of_property_read_u32(np, "st,capture-num-chan", &num_devs);
+	if (!ret)
+		cdata->cpt_num_devs = num_devs;
 
-	of_property_read_u32(np, "st,pwm-num-chan", &num_chan);
-	if (num_chan)
-		cdata->num_chan = num_chan;
+	if (!cdata->pwm_num_devs && !cdata->cpt_num_devs) {
+		dev_err(dev, "No channels configured\n");
+		return -EINVAL;
+	}
 
 	reg_fields = cdata->reg_fields;
 
@@ -277,15 +509,26 @@ static int sti_pwm_probe_dt(struct sti_pwm_chip *pc)
 	if (IS_ERR(pc->prescale_high))
 		return PTR_ERR(pc->prescale_high);
 
-	pc->pwm_en = devm_regmap_field_alloc(dev, pc->regmap,
-					     reg_fields[PWM_EN]);
-	if (IS_ERR(pc->pwm_en))
-		return PTR_ERR(pc->pwm_en);
 
-	pc->pwm_int_en = devm_regmap_field_alloc(dev, pc->regmap,
-						 reg_fields[PWM_INT_EN]);
-	if (IS_ERR(pc->pwm_int_en))
-		return PTR_ERR(pc->pwm_int_en);
+	pc->pwm_out_en = devm_regmap_field_alloc(dev, pc->regmap,
+						 reg_fields[PWM_OUT_EN]);
+	if (IS_ERR(pc->pwm_out_en))
+		return PTR_ERR(pc->pwm_out_en);
+
+	pc->pwm_cpt_en = devm_regmap_field_alloc(dev, pc->regmap,
+						 reg_fields[PWM_CPT_EN]);
+	if (IS_ERR(pc->pwm_cpt_en))
+		return PTR_ERR(pc->pwm_cpt_en);
+
+	pc->pwm_cpt_int_en = devm_regmap_field_alloc(dev, pc->regmap,
+						reg_fields[PWM_CPT_INT_EN]);
+	if (IS_ERR(pc->pwm_cpt_int_en))
+		return PTR_ERR(pc->pwm_cpt_int_en);
+
+	pc->pwm_cpt_int_stat = devm_regmap_field_alloc(dev, pc->regmap,
+						reg_fields[PWM_CPT_INT_STAT]);
+	if (PTR_ERR_OR_ZERO(pc->pwm_cpt_int_stat))
+		return PTR_ERR(pc->pwm_cpt_int_stat);
 
 	return 0;
 }
@@ -302,7 +545,8 @@ static int sti_pwm_probe(struct platform_device *pdev)
 	struct sti_pwm_compat_data *cdata;
 	struct sti_pwm_chip *pc;
 	struct resource *res;
-	int ret;
+	unsigned int i;
+	int irq, ret;
 
 	pc = devm_kzalloc(dev, sizeof(*pc), GFP_KERNEL);
 	if (!pc)
@@ -323,14 +567,28 @@ static int sti_pwm_probe(struct platform_device *pdev)
 	if (IS_ERR(pc->regmap))
 		return PTR_ERR(pc->regmap);
 
+	irq = platform_get_irq(pdev, 0);
+	if (irq < 0) {
+		dev_err(&pdev->dev, "Failed to obtain IRQ\n");
+		return irq;
+	}
+
+	ret = devm_request_irq(&pdev->dev, irq, sti_pwm_interrupt, 0,
+			       pdev->name, pc);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to request IRQ\n");
+		return ret;
+	}
+
 	/*
 	 * Setup PWM data with default values: some values could be replaced
 	 * with specific ones provided from Device Tree.
 	 */
-	cdata->reg_fields   = &sti_pwm_regfields[0];
+	cdata->reg_fields = sti_pwm_regfields;
 	cdata->max_prescale = 0xff;
-	cdata->max_pwm_cnt  = 255;
-	cdata->num_chan     = 1;
+	cdata->max_pwm_cnt = 255;
+	cdata->pwm_num_devs = 0;
+	cdata->cpt_num_devs = 0;
 
 	pc->cdata = cdata;
 	pc->dev = dev;
@@ -341,36 +599,64 @@ static int sti_pwm_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
-	pc->clk = of_clk_get_by_name(dev->of_node, "pwm");
-	if (IS_ERR(pc->clk)) {
+	if (!cdata->pwm_num_devs)
+		goto skip_pwm;
+
+	pc->pwm_clk = of_clk_get_by_name(dev->of_node, "pwm");
+	if (IS_ERR(pc->pwm_clk)) {
 		dev_err(dev, "failed to get PWM clock\n");
-		return PTR_ERR(pc->clk);
+		return PTR_ERR(pc->pwm_clk);
 	}
 
-	pc->clk_rate = clk_get_rate(pc->clk);
-	if (!pc->clk_rate) {
-		dev_err(dev, "failed to get clock rate\n");
-		return -EINVAL;
+	ret = clk_prepare(pc->pwm_clk);
+	if (ret) {
+		dev_err(dev, "failed to prepare clock\n");
+		return ret;
 	}
 
-	ret = clk_prepare(pc->clk);
+skip_pwm:
+	if (!cdata->cpt_num_devs)
+		goto skip_cpt;
+
+	pc->cpt_clk = of_clk_get_by_name(dev->of_node, "capture");
+	if (IS_ERR(pc->cpt_clk)) {
+		dev_err(dev, "failed to get PWM capture clock\n");
+		return PTR_ERR(pc->cpt_clk);
+	}
+
+	ret = clk_prepare(pc->cpt_clk);
 	if (ret) {
 		dev_err(dev, "failed to prepare clock\n");
 		return ret;
 	}
 
+skip_cpt:
 	pc->chip.dev = dev;
 	pc->chip.ops = &sti_pwm_ops;
 	pc->chip.base = -1;
-	pc->chip.npwm = pc->cdata->num_chan;
+	pc->chip.npwm = pc->cdata->pwm_num_devs;
 	pc->chip.can_sleep = true;
 
 	ret = pwmchip_add(&pc->chip);
 	if (ret < 0) {
-		clk_unprepare(pc->clk);
+		clk_unprepare(pc->pwm_clk);
+		clk_unprepare(pc->cpt_clk);
 		return ret;
 	}
 
+	for (i = 0; i < cdata->cpt_num_devs; i++) {
+		struct sti_cpt_ddata *ddata;
+
+		ddata = devm_kzalloc(dev, sizeof(*ddata), GFP_KERNEL);
+		if (!ddata)
+			return -ENOMEM;
+
+		init_waitqueue_head(&ddata->wait);
+		mutex_init(&ddata->lock);
+
+		pwm_set_chip_data(&pc->chip.pwms[i], ddata);
+	}
+
 	platform_set_drvdata(pdev, pc);
 
 	return 0;
@@ -381,10 +667,11 @@ static int sti_pwm_remove(struct platform_device *pdev)
 	struct sti_pwm_chip *pc = platform_get_drvdata(pdev);
 	unsigned int i;
 
-	for (i = 0; i < pc->cdata->num_chan; i++)
+	for (i = 0; i < pc->cdata->pwm_num_devs; i++)
 		pwm_disable(&pc->chip.pwms[i]);
 
-	clk_unprepare(pc->clk);
+	clk_unprepare(pc->pwm_clk);
+	clk_unprepare(pc->cpt_clk);
 
 	return pwmchip_remove(&pc->chip);
 }
diff --git a/drivers/pwm/pwm-sun4i.c b/drivers/pwm/pwm-sun4i.c
index 03a99a53c39e..b0803f6c64d9 100644
--- a/drivers/pwm/pwm-sun4i.c
+++ b/drivers/pwm/pwm-sun4i.c
@@ -284,6 +284,12 @@ static const struct sun4i_pwm_data sun4i_pwm_data_a20 = {
 	.npwm = 2,
 };
 
+static const struct sun4i_pwm_data sun4i_pwm_data_h3 = {
+	.has_prescaler_bypass = true,
+	.has_rdy = true,
+	.npwm = 1,
+};
+
 static const struct of_device_id sun4i_pwm_dt_ids[] = {
 	{
 		.compatible = "allwinner,sun4i-a10-pwm",
@@ -298,6 +304,9 @@ static const struct of_device_id sun4i_pwm_dt_ids[] = {
 		.compatible = "allwinner,sun7i-a20-pwm",
 		.data = &sun4i_pwm_data_a20,
 	}, {
+		.compatible = "allwinner,sun8i-h3-pwm",
+		.data = &sun4i_pwm_data_h3,
+	}, {
 		/* sentinel */
 	},
 };
diff --git a/drivers/pwm/pwm-tipwmss.c b/drivers/pwm/pwm-tipwmss.c
index 829f4991c96f..7fa85a1604da 100644
--- a/drivers/pwm/pwm-tipwmss.c
+++ b/drivers/pwm/pwm-tipwmss.c
@@ -34,7 +34,6 @@ static int pwmss_probe(struct platform_device *pdev)
 	struct device_node *node = pdev->dev.of_node;
 
 	pm_runtime_enable(&pdev->dev);
-	pm_runtime_get_sync(&pdev->dev);
 
 	/* Populate all the child nodes here... */
 	ret = of_platform_populate(node, NULL, NULL, &pdev->dev);
@@ -46,31 +45,13 @@ static int pwmss_probe(struct platform_device *pdev)
 
 static int pwmss_remove(struct platform_device *pdev)
 {
-	pm_runtime_put_sync(&pdev->dev);
 	pm_runtime_disable(&pdev->dev);
 	return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int pwmss_suspend(struct device *dev)
-{
-	pm_runtime_put_sync(dev);
-	return 0;
-}
-
-static int pwmss_resume(struct device *dev)
-{
-	pm_runtime_get_sync(dev);
-	return 0;
-}
-#endif
-
-static SIMPLE_DEV_PM_OPS(pwmss_pm_ops, pwmss_suspend, pwmss_resume);
-
 static struct platform_driver pwmss_driver = {
 	.driver	= {
 		.name	= "pwmss",
-		.pm	= &pwmss_pm_ops,
 		.of_match_table	= pwmss_of_match,
 	},
 	.probe	= pwmss_probe,
diff --git a/drivers/pwm/pwm-twl.c b/drivers/pwm/pwm-twl.c
index 04f76725d591..7a993b056638 100644
--- a/drivers/pwm/pwm-twl.c
+++ b/drivers/pwm/pwm-twl.c
@@ -269,6 +269,22 @@ static void twl6030_pwm_disable(struct pwm_chip *chip, struct pwm_device *pwm)
 		goto out;
 	}
 
+	val |= TWL6030_PWM_TOGGLE(pwm->hwpwm, TWL6030_PWMXEN);
+
+	ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG);
+	if (ret < 0) {
+		dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label);
+		goto out;
+	}
+
+	val &= ~TWL6030_PWM_TOGGLE(pwm->hwpwm, TWL6030_PWMXEN);
+
+	ret = twl_i2c_write_u8(TWL6030_MODULE_ID1, val, TWL6030_TOGGLE3_REG);
+	if (ret < 0) {
+		dev_err(chip->dev, "%s: Failed to disable PWM\n", pwm->label);
+		goto out;
+	}
+
 	twl->twl6030_toggle3 = val;
 out:
 	mutex_unlock(&twl->mutex);
diff --git a/drivers/pwm/sysfs.c b/drivers/pwm/sysfs.c
index 18ed725594c3..0296d8178ae2 100644
--- a/drivers/pwm/sysfs.c
+++ b/drivers/pwm/sysfs.c
@@ -409,6 +409,24 @@ void pwmchip_sysfs_unexport(struct pwm_chip *chip)
 	}
 }
 
+void pwmchip_sysfs_unexport_children(struct pwm_chip *chip)
+{
+	struct device *parent;
+	unsigned int i;
+
+	parent = class_find_device(&pwm_class, NULL, chip,
+				   pwmchip_sysfs_match);
+	if (!parent)
+		return;
+
+	for (i = 0; i < chip->npwm; i++) {
+		struct pwm_device *pwm = &chip->pwms[i];
+
+		if (test_bit(PWMF_EXPORTED, &pwm->flags))
+			pwm_unexport_child(parent, pwm);
+	}
+}
+
 static int __init pwm_sysfs_init(void)
 {
 	return class_register(&pwm_class);
diff --git a/drivers/regulator/max8973-regulator.c b/drivers/regulator/max8973-regulator.c
index 3958f50c5975..e0c747aa9f85 100644
--- a/drivers/regulator/max8973-regulator.c
+++ b/drivers/regulator/max8973-regulator.c
@@ -495,7 +495,8 @@ static irqreturn_t max8973_thermal_irq(int irq, void *data)
 {
 	struct max8973_chip *mchip = data;
 
-	thermal_zone_device_update(mchip->tz_device);
+	thermal_zone_device_update(mchip->tz_device,
+				   THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/Kconfig b/drivers/thermal/Kconfig
index 2d702ca6556f..a13541bdc726 100644
--- a/drivers/thermal/Kconfig
+++ b/drivers/thermal/Kconfig
@@ -186,7 +186,7 @@ config HISI_THERMAL
 
 config IMX_THERMAL
 	tristate "Temperature sensor driver for Freescale i.MX SoCs"
-	depends on CPU_THERMAL
+	depends on (ARCH_MXC && CPU_THERMAL) || COMPILE_TEST
 	depends on MFD_SYSCON
 	depends on OF
 	help
@@ -195,6 +195,26 @@ config IMX_THERMAL
 	  cpufreq is used as the cooling device to throttle CPUs when the
 	  passive trip is crossed.
 
+config MAX77620_THERMAL
+	tristate "Temperature sensor driver for Maxim MAX77620 PMIC"
+	depends on MFD_MAX77620
+	depends on OF
+	help
+	  Support for die junction temperature warning alarm for Maxim
+	  Semiconductor PMIC MAX77620 device. Device generates two alarm
+	  interrupts when PMIC die temperature cross the threshold of
+	  120 degC and 140 degC.
+
+config QORIQ_THERMAL
+	tristate "QorIQ Thermal Monitoring Unit"
+	depends on THERMAL_OF
+	depends on HAS_IOMEM
+	help
+	  Support for Thermal Monitoring Unit (TMU) found on QorIQ platforms.
+	  It supports one critical trip point and one passive trip point. The
+	  cpufreq is used as the cooling device to throttle CPUs when the
+	  passive trip is crossed.
+
 config SPEAR_THERMAL
 	tristate "SPEAr thermal sensor driver"
 	depends on PLAT_SPEAR || COMPILE_TEST
@@ -332,6 +352,16 @@ menu "ACPI INT340X thermal drivers"
 source drivers/thermal/int340x_thermal/Kconfig
 endmenu
 
+config INTEL_BXT_PMIC_THERMAL
+	tristate "Intel Broxton PMIC thermal driver"
+	depends on X86 && INTEL_SOC_PMIC && REGMAP
+	help
+	  Select this driver for Intel Broxton PMIC with ADC channels monitoring
+	  system temperature measurements and alerts.
+	  This driver is used for monitoring the ADC channels of PMIC and handles
+	  the alert trip point interrupts and notifies the thermal framework with
+	  the trip point and temperature details of the zone.
+
 config INTEL_PCH_THERMAL
 	tristate "Intel PCH Thermal Reporting Driver"
 	depends on X86 && PCI
@@ -399,4 +429,9 @@ config GENERIC_ADC_THERMAL
 	  to this driver. This driver reports the temperature by reading ADC
 	  channel and converts it to temperature based on lookup table.
 
+menu "Qualcomm thermal drivers"
+depends on (ARCH_QCOM && OF) || COMPILE_TEST
+source "drivers/thermal/qcom/Kconfig"
+endmenu
+
 endif
diff --git a/drivers/thermal/Makefile b/drivers/thermal/Makefile
index 10b07c14f8a9..c92eb22a41ff 100644
--- a/drivers/thermal/Makefile
+++ b/drivers/thermal/Makefile
@@ -37,6 +37,8 @@ obj-$(CONFIG_DB8500_THERMAL)	+= db8500_thermal.o
 obj-$(CONFIG_ARMADA_THERMAL)	+= armada_thermal.o
 obj-$(CONFIG_TANGO_THERMAL)	+= tango_thermal.o
 obj-$(CONFIG_IMX_THERMAL)	+= imx_thermal.o
+obj-$(CONFIG_MAX77620_THERMAL)	+= max77620_thermal.o
+obj-$(CONFIG_QORIQ_THERMAL)	+= qoriq_thermal.o
 obj-$(CONFIG_DB8500_CPUFREQ_COOLING)	+= db8500_cpufreq_cooling.o
 obj-$(CONFIG_INTEL_POWERCLAMP)	+= intel_powerclamp.o
 obj-$(CONFIG_X86_PKG_TEMP_THERMAL)	+= x86_pkg_temp_thermal.o
@@ -45,8 +47,10 @@ obj-$(CONFIG_INTEL_SOC_DTS_THERMAL)	+= intel_soc_dts_thermal.o
 obj-$(CONFIG_INTEL_QUARK_DTS_THERMAL)	+= intel_quark_dts_thermal.o
 obj-$(CONFIG_TI_SOC_THERMAL)	+= ti-soc-thermal/
 obj-$(CONFIG_INT340X_THERMAL)  += int340x_thermal/
+obj-$(CONFIG_INTEL_BXT_PMIC_THERMAL) += intel_bxt_pmic_thermal.o
 obj-$(CONFIG_INTEL_PCH_THERMAL)	+= intel_pch_thermal.o
 obj-$(CONFIG_ST_THERMAL)	+= st/
+obj-$(CONFIG_QCOM_TSENS)	+= qcom/
 obj-$(CONFIG_TEGRA_SOCTHERM)	+= tegra/
 obj-$(CONFIG_HISI_THERMAL)     += hisi_thermal.o
 obj-$(CONFIG_MTK_THERMAL)	+= mtk_thermal.o
diff --git a/drivers/thermal/cpu_cooling.c b/drivers/thermal/cpu_cooling.c
index a32b41783b77..9ce0e9eef923 100644
--- a/drivers/thermal/cpu_cooling.c
+++ b/drivers/thermal/cpu_cooling.c
@@ -74,7 +74,7 @@ struct power_table {
  *	cpufreq frequencies.
  * @allowed_cpus: all the cpus involved for this cpufreq_cooling_device.
  * @node: list_head to link all cpufreq_cooling_device together.
- * @last_load: load measured by the latest call to cpufreq_get_actual_power()
+ * @last_load: load measured by the latest call to cpufreq_get_requested_power()
  * @time_in_idle: previous reading of the absolute time that this cpu was idle
  * @time_in_idle_timestamp: wall time of the last invocation of
  *	get_cpu_idle_time_us()
diff --git a/drivers/thermal/db8500_thermal.c b/drivers/thermal/db8500_thermal.c
index 652acd8fbe48..e776cea80cfc 100644
--- a/drivers/thermal/db8500_thermal.c
+++ b/drivers/thermal/db8500_thermal.c
@@ -306,7 +306,7 @@ static void db8500_thermal_work(struct work_struct *work)
 	if (cur_mode == THERMAL_DEVICE_DISABLED)
 		return;
 
-	thermal_zone_device_update(pzone->therm_dev);
+	thermal_zone_device_update(pzone->therm_dev, THERMAL_EVENT_UNSPECIFIED);
 	dev_dbg(&pzone->therm_dev->device, "thermal work finished.\n");
 }
 
diff --git a/drivers/thermal/devfreq_cooling.c b/drivers/thermal/devfreq_cooling.c
index 01f0015f80dc..81631b110e17 100644
--- a/drivers/thermal/devfreq_cooling.c
+++ b/drivers/thermal/devfreq_cooling.c
@@ -312,7 +312,7 @@ static int devfreq_cooling_state2power(struct thermal_cooling_device *cdev,
 	unsigned long freq;
 	u32 static_power;
 
-	if (state < 0 || state >= dfc->freq_table_size)
+	if (state >= dfc->freq_table_size)
 		return -EINVAL;
 
 	freq = dfc->freq_table[state];
diff --git a/drivers/thermal/gov_bang_bang.c b/drivers/thermal/gov_bang_bang.c
index bb118a152cbb..fc5e5057f0de 100644
--- a/drivers/thermal/gov_bang_bang.c
+++ b/drivers/thermal/gov_bang_bang.c
@@ -65,7 +65,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip)
 		if (instance->target == 0 && tz->temperature >= trip_temp)
 			instance->target = 1;
 		else if (instance->target == 1 &&
-				tz->temperature < trip_temp - trip_hyst)
+				tz->temperature <= trip_temp - trip_hyst)
 			instance->target = 0;
 
 		dev_dbg(&instance->cdev->device, "target=%d\n",
diff --git a/drivers/thermal/hisi_thermal.c b/drivers/thermal/hisi_thermal.c
index 97fad8f51e1c..f6429666a1cf 100644
--- a/drivers/thermal/hisi_thermal.c
+++ b/drivers/thermal/hisi_thermal.c
@@ -237,7 +237,8 @@ static irqreturn_t hisi_thermal_alarm_irq_thread(int irq, void *dev)
 		if (!data->sensors[i].tzd)
 			continue;
 
-		thermal_zone_device_update(data->sensors[i].tzd);
+		thermal_zone_device_update(data->sensors[i].tzd,
+					   THERMAL_EVENT_UNSPECIFIED);
 	}
 
 	return IRQ_HANDLED;
diff --git a/drivers/thermal/imx_thermal.c b/drivers/thermal/imx_thermal.c
index e473548b5d28..06912f0602b7 100644
--- a/drivers/thermal/imx_thermal.c
+++ b/drivers/thermal/imx_thermal.c
@@ -246,7 +246,7 @@ static int imx_set_mode(struct thermal_zone_device *tz,
 	}
 
 	data->mode = mode;
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return 0;
 }
@@ -457,7 +457,7 @@ static irqreturn_t imx_thermal_alarm_irq_thread(int irq, void *dev)
 	dev_dbg(&data->tz->device, "THERMAL ALARM: T > %d\n",
 		data->alarm_temp / 1000);
 
-	thermal_zone_device_update(data->tz);
+	thermal_zone_device_update(data->tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/int340x_thermal/int3402_thermal.c b/drivers/thermal/int340x_thermal/int3402_thermal.c
index 69df3d960303..8e90b3151a42 100644
--- a/drivers/thermal/int340x_thermal/int3402_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3402_thermal.c
@@ -35,7 +35,8 @@ static void int3402_notify(acpi_handle handle, u32 event, void *data)
 	case INT3402_PERF_CHANGED_EVENT:
 		break;
 	case INT3402_THERMAL_EVENT:
-		int340x_thermal_zone_device_update(priv->int340x_zone);
+		int340x_thermal_zone_device_update(priv->int340x_zone,
+						   THERMAL_TRIP_VIOLATED);
 		break;
 	default:
 		break;
diff --git a/drivers/thermal/int340x_thermal/int3403_thermal.c b/drivers/thermal/int340x_thermal/int3403_thermal.c
index 50a7a08e3a15..c4890c9437eb 100644
--- a/drivers/thermal/int340x_thermal/int3403_thermal.c
+++ b/drivers/thermal/int340x_thermal/int3403_thermal.c
@@ -25,6 +25,7 @@
 #define INT3403_TYPE_CHARGER		0x0B
 #define INT3403_TYPE_BATTERY		0x0C
 #define INT3403_PERF_CHANGED_EVENT	0x80
+#define INT3403_PERF_TRIP_POINT_CHANGED	0x81
 #define INT3403_THERMAL_EVENT		0x90
 
 /* Preserved structure for future expandbility */
@@ -72,7 +73,13 @@ static void int3403_notify(acpi_handle handle,
 	case INT3403_PERF_CHANGED_EVENT:
 		break;
 	case INT3403_THERMAL_EVENT:
-		int340x_thermal_zone_device_update(obj->int340x_zone);
+		int340x_thermal_zone_device_update(obj->int340x_zone,
+						   THERMAL_TRIP_VIOLATED);
+		break;
+	case INT3403_PERF_TRIP_POINT_CHANGED:
+		int340x_thermal_read_trips(obj->int340x_zone);
+		int340x_thermal_zone_device_update(obj->int340x_zone,
+						   THERMAL_TRIP_CHANGED);
 		break;
 	default:
 		dev_err(&priv->pdev->dev, "Unsupported event [0x%x]\n", event);
diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
index b9b2666aa94c..145a5c53ff5c 100644
--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.c
@@ -177,6 +177,42 @@ static int int340x_thermal_get_trip_config(acpi_handle handle, char *name,
 	return 0;
 }
 
+int int340x_thermal_read_trips(struct int34x_thermal_zone *int34x_zone)
+{
+	int trip_cnt = int34x_zone->aux_trip_nr;
+	int i;
+
+	int34x_zone->crt_trip_id = -1;
+	if (!int340x_thermal_get_trip_config(int34x_zone->adev->handle, "_CRT",
+					     &int34x_zone->crt_temp))
+		int34x_zone->crt_trip_id = trip_cnt++;
+
+	int34x_zone->hot_trip_id = -1;
+	if (!int340x_thermal_get_trip_config(int34x_zone->adev->handle, "_HOT",
+					     &int34x_zone->hot_temp))
+		int34x_zone->hot_trip_id = trip_cnt++;
+
+	int34x_zone->psv_trip_id = -1;
+	if (!int340x_thermal_get_trip_config(int34x_zone->adev->handle, "_PSV",
+					     &int34x_zone->psv_temp))
+		int34x_zone->psv_trip_id = trip_cnt++;
+
+	for (i = 0; i < INT340X_THERMAL_MAX_ACT_TRIP_COUNT; i++) {
+		char name[5] = { '_', 'A', 'C', '0' + i, '\0' };
+
+		if (int340x_thermal_get_trip_config(int34x_zone->adev->handle,
+					name,
+					&int34x_zone->act_trips[i].temp))
+			break;
+
+		int34x_zone->act_trips[i].id = trip_cnt++;
+		int34x_zone->act_trips[i].valid = true;
+	}
+
+	return trip_cnt;
+}
+EXPORT_SYMBOL_GPL(int340x_thermal_read_trips);
+
 static struct thermal_zone_params int340x_thermal_params = {
 	.governor_name = "user_space",
 	.no_hwmon = true,
@@ -188,7 +224,7 @@ struct int34x_thermal_zone *int340x_thermal_zone_add(struct acpi_device *adev,
 	struct int34x_thermal_zone *int34x_thermal_zone;
 	acpi_status status;
 	unsigned long long trip_cnt;
-	int trip_mask = 0, i;
+	int trip_mask = 0;
 	int ret;
 
 	int34x_thermal_zone = kzalloc(sizeof(*int34x_thermal_zone),
@@ -214,28 +250,8 @@ struct int34x_thermal_zone *int340x_thermal_zone_add(struct acpi_device *adev,
 		int34x_thermal_zone->aux_trip_nr = trip_cnt;
 	}
 
-	int34x_thermal_zone->crt_trip_id = -1;
-	if (!int340x_thermal_get_trip_config(adev->handle, "_CRT",
-					     &int34x_thermal_zone->crt_temp))
-		int34x_thermal_zone->crt_trip_id = trip_cnt++;
-	int34x_thermal_zone->hot_trip_id = -1;
-	if (!int340x_thermal_get_trip_config(adev->handle, "_HOT",
-					     &int34x_thermal_zone->hot_temp))
-		int34x_thermal_zone->hot_trip_id = trip_cnt++;
-	int34x_thermal_zone->psv_trip_id = -1;
-	if (!int340x_thermal_get_trip_config(adev->handle, "_PSV",
-					     &int34x_thermal_zone->psv_temp))
-		int34x_thermal_zone->psv_trip_id = trip_cnt++;
-	for (i = 0; i < INT340X_THERMAL_MAX_ACT_TRIP_COUNT; i++) {
-		char name[5] = { '_', 'A', 'C', '0' + i, '\0' };
+	trip_cnt = int340x_thermal_read_trips(int34x_thermal_zone);
 
-		if (int340x_thermal_get_trip_config(adev->handle, name,
-				&int34x_thermal_zone->act_trips[i].temp))
-			break;
-
-		int34x_thermal_zone->act_trips[i].id = trip_cnt++;
-		int34x_thermal_zone->act_trips[i].valid = true;
-	}
 	int34x_thermal_zone->lpat_table = acpi_lpat_get_conversion_table(
 								adev->handle);
 
diff --git a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
index aaadf724ff2e..5f3ba4775c5c 100644
--- a/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
+++ b/drivers/thermal/int340x_thermal/int340x_thermal_zone.h
@@ -46,6 +46,7 @@ struct int34x_thermal_zone {
 struct int34x_thermal_zone *int340x_thermal_zone_add(struct acpi_device *,
 				struct thermal_zone_device_ops *override_ops);
 void int340x_thermal_zone_remove(struct int34x_thermal_zone *);
+int int340x_thermal_read_trips(struct int34x_thermal_zone *int34x_zone);
 
 static inline void int340x_thermal_zone_set_priv_data(
 			struct int34x_thermal_zone *tzone, void *priv_data)
@@ -60,9 +61,10 @@ static inline void *int340x_thermal_zone_get_priv_data(
 }
 
 static inline void int340x_thermal_zone_device_update(
-			struct int34x_thermal_zone *tzone)
+					struct int34x_thermal_zone *tzone,
+					enum thermal_notify_event event)
 {
-	thermal_zone_device_update(tzone->zone);
+	thermal_zone_device_update(tzone->zone, event);
 }
 
 #endif
diff --git a/drivers/thermal/int340x_thermal/processor_thermal_device.c b/drivers/thermal/int340x_thermal/processor_thermal_device.c
index 42c1ac057bad..ff3b36f339e3 100644
--- a/drivers/thermal/int340x_thermal/processor_thermal_device.c
+++ b/drivers/thermal/int340x_thermal/processor_thermal_device.c
@@ -258,7 +258,8 @@ static void proc_thermal_notify(acpi_handle handle, u32 event, void *data)
 	switch (event) {
 	case PROC_POWER_CAPABILITY_CHANGED:
 		proc_thermal_read_ppcc(proc_priv);
-		int340x_thermal_zone_device_update(proc_priv->int340x_zone);
+		int340x_thermal_zone_device_update(proc_priv->int340x_zone,
+				THERMAL_DEVICE_POWER_CAPABILITY_CHANGED);
 		break;
 	default:
 		dev_err(proc_priv->dev, "Unsupported event [0x%x]\n", event);
diff --git a/drivers/thermal/intel_bxt_pmic_thermal.c b/drivers/thermal/intel_bxt_pmic_thermal.c
new file mode 100644
index 000000000000..0f19a393ddd8
--- /dev/null
+++ b/drivers/thermal/intel_bxt_pmic_thermal.c
@@ -0,0 +1,300 @@
+/*
+ * Intel Broxton PMIC thermal driver
+ *
+ * Copyright (C) 2016 Intel Corporation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License version
+ * 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/kernel.h>
+#include <linux/slab.h>
+#include <linux/delay.h>
+#include <linux/interrupt.h>
+#include <linux/device.h>
+#include <linux/thermal.h>
+#include <linux/platform_device.h>
+#include <linux/sched.h>
+#include <linux/mfd/intel_soc_pmic.h>
+
+#define BXTWC_THRM0IRQ		0x4E04
+#define BXTWC_THRM1IRQ		0x4E05
+#define BXTWC_THRM2IRQ		0x4E06
+#define BXTWC_MTHRM0IRQ		0x4E12
+#define BXTWC_MTHRM1IRQ		0x4E13
+#define BXTWC_MTHRM2IRQ		0x4E14
+#define BXTWC_STHRM0IRQ		0x4F19
+#define BXTWC_STHRM1IRQ		0x4F1A
+#define BXTWC_STHRM2IRQ		0x4F1B
+
+struct trip_config_map {
+	u16 irq_reg;
+	u16 irq_en;
+	u16 evt_stat;
+	u8 irq_mask;
+	u8 irq_en_mask;
+	u8 evt_mask;
+	u8 trip_num;
+};
+
+struct thermal_irq_map {
+	char handle[20];
+	int num_trips;
+	const struct trip_config_map *trip_config;
+};
+
+struct pmic_thermal_data {
+	const struct thermal_irq_map *maps;
+	int num_maps;
+};
+
+static const struct trip_config_map bxtwc_str0_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x01,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x01,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x01,
+		.trip_num = 0
+	},
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x10,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x10,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x10,
+		.trip_num = 1
+	}
+};
+
+static const struct trip_config_map bxtwc_str1_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x02,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x02,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x02,
+		.trip_num = 0
+	},
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x20,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x20,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x20,
+		.trip_num = 1
+	},
+};
+
+static const struct trip_config_map bxtwc_str2_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x04,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x04,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x04,
+		.trip_num = 0
+	},
+	{
+		.irq_reg = BXTWC_THRM0IRQ,
+		.irq_mask = 0x40,
+		.irq_en = BXTWC_MTHRM0IRQ,
+		.irq_en_mask = 0x40,
+		.evt_stat = BXTWC_STHRM0IRQ,
+		.evt_mask = 0x40,
+		.trip_num = 1
+	},
+};
+
+static const struct trip_config_map bxtwc_str3_trip_config[] = {
+	{
+		.irq_reg = BXTWC_THRM2IRQ,
+		.irq_mask = 0x10,
+		.irq_en = BXTWC_MTHRM2IRQ,
+		.irq_en_mask = 0x10,
+		.evt_stat = BXTWC_STHRM2IRQ,
+		.evt_mask = 0x10,
+		.trip_num = 0
+	},
+};
+
+static const struct thermal_irq_map bxtwc_thermal_irq_map[] = {
+	{
+		.handle = "STR0",
+		.trip_config = bxtwc_str0_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str0_trip_config),
+	},
+	{
+		.handle = "STR1",
+		.trip_config = bxtwc_str1_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str1_trip_config),
+	},
+	{
+		.handle = "STR2",
+		.trip_config = bxtwc_str2_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str2_trip_config),
+	},
+	{
+		.handle = "STR3",
+		.trip_config = bxtwc_str3_trip_config,
+		.num_trips = ARRAY_SIZE(bxtwc_str3_trip_config),
+	},
+};
+
+static const struct pmic_thermal_data bxtwc_thermal_data = {
+	.maps = bxtwc_thermal_irq_map,
+	.num_maps = ARRAY_SIZE(bxtwc_thermal_irq_map),
+};
+
+static irqreturn_t pmic_thermal_irq_handler(int irq, void *data)
+{
+	struct platform_device *pdev = data;
+	struct thermal_zone_device *tzd;
+	struct pmic_thermal_data *td;
+	struct intel_soc_pmic *pmic;
+	struct regmap *regmap;
+	u8 reg_val, mask, irq_stat, trip;
+	u16 reg, evt_stat_reg;
+	int i, j, ret;
+
+	pmic = dev_get_drvdata(pdev->dev.parent);
+	regmap = pmic->regmap;
+	td = (struct pmic_thermal_data *)
+		platform_get_device_id(pdev)->driver_data;
+
+	/* Resolve thermal irqs */
+	for (i = 0; i < td->num_maps; i++) {
+		for (j = 0; j < td->maps[i].num_trips; j++) {
+			reg = td->maps[i].trip_config[j].irq_reg;
+			mask = td->maps[i].trip_config[j].irq_mask;
+			/*
+			 * Read the irq register to resolve whether the
+			 * interrupt was triggered for this sensor
+			 */
+			if (regmap_read(regmap, reg, &ret))
+				return IRQ_HANDLED;
+
+			reg_val = (u8)ret;
+			irq_stat = ((u8)ret & mask);
+
+			if (!irq_stat)
+				continue;
+
+			/*
+			 * Read the status register to find out what
+			 * event occurred i.e a high or a low
+			 */
+			evt_stat_reg = td->maps[i].trip_config[j].evt_stat;
+			if (regmap_read(regmap, evt_stat_reg, &ret))
+				return IRQ_HANDLED;
+
+			trip = td->maps[i].trip_config[j].trip_num;
+			tzd = thermal_zone_get_zone_by_name(td->maps[i].handle);
+			if (!IS_ERR(tzd))
+				thermal_zone_device_update(tzd,
+						THERMAL_EVENT_UNSPECIFIED);
+
+			/* Clear the appropriate irq */
+			regmap_write(regmap, reg, reg_val & mask);
+		}
+	}
+
+	return IRQ_HANDLED;
+}
+
+static int pmic_thermal_probe(struct platform_device *pdev)
+{
+	struct regmap_irq_chip_data *regmap_irq_chip;
+	struct pmic_thermal_data *thermal_data;
+	int ret, irq, virq, i, j, pmic_irq_count;
+	struct intel_soc_pmic *pmic;
+	struct regmap *regmap;
+	struct device *dev;
+	u16 reg;
+	u8 mask;
+
+	dev = &pdev->dev;
+	pmic = dev_get_drvdata(pdev->dev.parent);
+	if (!pmic) {
+		dev_err(dev, "Failed to get struct intel_soc_pmic pointer\n");
+		return -ENODEV;
+	}
+
+	thermal_data = (struct pmic_thermal_data *)
+				platform_get_device_id(pdev)->driver_data;
+	if (!thermal_data) {
+		dev_err(dev, "No thermal data initialized!!\n");
+		return -ENODEV;
+	}
+
+	regmap = pmic->regmap;
+	regmap_irq_chip = pmic->irq_chip_data_level2;
+
+	pmic_irq_count = 0;
+	while ((irq = platform_get_irq(pdev, pmic_irq_count)) != -ENXIO) {
+		virq = regmap_irq_get_virq(regmap_irq_chip, irq);
+		if (virq < 0) {
+			dev_err(dev, "failed to get virq by irq %d\n", irq);
+			return virq;
+		}
+
+		ret = devm_request_threaded_irq(&pdev->dev, virq,
+				NULL, pmic_thermal_irq_handler,
+				IRQF_ONESHOT, "pmic_thermal", pdev);
+
+		if (ret) {
+			dev_err(dev, "request irq(%d) failed: %d\n", virq, ret);
+			return ret;
+		}
+		pmic_irq_count++;
+	}
+
+	/* Enable thermal interrupts */
+	for (i = 0; i < thermal_data->num_maps; i++) {
+		for (j = 0; j < thermal_data->maps[i].num_trips; j++) {
+			reg = thermal_data->maps[i].trip_config[j].irq_en;
+			mask = thermal_data->maps[i].trip_config[j].irq_en_mask;
+			ret = regmap_update_bits(regmap, reg, mask, 0x00);
+			if (ret)
+				return ret;
+		}
+	}
+
+	return 0;
+}
+
+static const struct platform_device_id pmic_thermal_id_table[] = {
+	{
+		.name = "bxt_wcove_thermal",
+		.driver_data = (kernel_ulong_t)&bxtwc_thermal_data,
+	},
+	{},
+};
+
+static struct platform_driver pmic_thermal_driver = {
+	.probe = pmic_thermal_probe,
+	.driver = {
+		.name = "pmic_thermal",
+	},
+	.id_table = pmic_thermal_id_table,
+};
+
+MODULE_DEVICE_TABLE(platform, pmic_thermal_id_table);
+module_platform_driver(pmic_thermal_driver);
+
+MODULE_AUTHOR("Yegnesh S Iyer <yegnesh.s.iyer@intel.com>");
+MODULE_DESCRIPTION("Intel Broxton PMIC Thermal Driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/intel_soc_dts_iosf.c b/drivers/thermal/intel_soc_dts_iosf.c
index f72e1db3216f..e0813dfaa278 100644
--- a/drivers/thermal/intel_soc_dts_iosf.c
+++ b/drivers/thermal/intel_soc_dts_iosf.c
@@ -391,7 +391,8 @@ void intel_soc_dts_iosf_interrupt_handler(struct intel_soc_dts_sensors *sensors)
 
 		for (i = 0; i < SOC_MAX_DTS_SENSORS; ++i) {
 			pr_debug("TZD update for zone %d\n", i);
-			thermal_zone_device_update(sensors->soc_dts[i].tzone);
+			thermal_zone_device_update(sensors->soc_dts[i].tzone,
+						   THERMAL_EVENT_UNSPECIFIED);
 		}
 	} else
 		spin_unlock_irqrestore(&sensors->intr_notify_lock, flags);
diff --git a/drivers/thermal/max77620_thermal.c b/drivers/thermal/max77620_thermal.c
new file mode 100644
index 000000000000..83905ff46e40
--- /dev/null
+++ b/drivers/thermal/max77620_thermal.c
@@ -0,0 +1,166 @@
+/*
+ * Junction temperature thermal driver for Maxim Max77620.
+ *
+ * Copyright (c) 2016, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Author: Laxman Dewangan <ldewangan@nvidia.com>
+ *	   Mallikarjun Kasoju <mkasoju@nvidia.com>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ */
+
+#include <linux/irq.h>
+#include <linux/interrupt.h>
+#include <linux/mfd/max77620.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+
+#define MAX77620_NORMAL_OPERATING_TEMP	100000
+#define MAX77620_TJALARM1_TEMP		120000
+#define MAX77620_TJALARM2_TEMP		140000
+
+struct max77620_therm_info {
+	struct device			*dev;
+	struct regmap			*rmap;
+	struct thermal_zone_device	*tz_device;
+	int				irq_tjalarm1;
+	int				irq_tjalarm2;
+};
+
+/**
+ * max77620_thermal_read_temp: Read PMIC die temperatue.
+ * @data:	Device specific data.
+ * temp:	Temperature in millidegrees Celsius
+ *
+ * The actual temperature of PMIC die is not available from PMIC.
+ * PMIC only tells the status if it has crossed or not the threshold level
+ * of 120degC or 140degC.
+ * If threshold has not been crossed then assume die temperature as 100degC
+ * else 120degC or 140deG based on the PMIC die temp threshold status.
+ *
+ * Return 0 on success otherwise error number to show reason of failure.
+ */
+
+static int max77620_thermal_read_temp(void *data, int *temp)
+{
+	struct max77620_therm_info *mtherm = data;
+	unsigned int val;
+	int ret;
+
+	ret = regmap_read(mtherm->rmap, MAX77620_REG_STATLBT, &val);
+	if (ret < 0) {
+		dev_err(mtherm->dev, "Failed to read STATLBT: %d\n", ret);
+		return ret;
+	}
+
+	if (val & MAX77620_IRQ_TJALRM2_MASK)
+		*temp = MAX77620_TJALARM2_TEMP;
+	else if (val & MAX77620_IRQ_TJALRM1_MASK)
+		*temp = MAX77620_TJALARM1_TEMP;
+	else
+		*temp = MAX77620_NORMAL_OPERATING_TEMP;
+
+	return 0;
+}
+
+static const struct thermal_zone_of_device_ops max77620_thermal_ops = {
+	.get_temp = max77620_thermal_read_temp,
+};
+
+static irqreturn_t max77620_thermal_irq(int irq, void *data)
+{
+	struct max77620_therm_info *mtherm = data;
+
+	if (irq == mtherm->irq_tjalarm1)
+		dev_warn(mtherm->dev, "Junction Temp Alarm1(120C) occurred\n");
+	else if (irq == mtherm->irq_tjalarm2)
+		dev_crit(mtherm->dev, "Junction Temp Alarm2(140C) occurred\n");
+
+	thermal_zone_device_update(mtherm->tz_device,
+				   THERMAL_EVENT_UNSPECIFIED);
+
+	return IRQ_HANDLED;
+}
+
+static int max77620_thermal_probe(struct platform_device *pdev)
+{
+	struct max77620_therm_info *mtherm;
+	int ret;
+
+	mtherm = devm_kzalloc(&pdev->dev, sizeof(*mtherm), GFP_KERNEL);
+	if (!mtherm)
+		return -ENOMEM;
+
+	mtherm->irq_tjalarm1 = platform_get_irq(pdev, 0);
+	mtherm->irq_tjalarm2 = platform_get_irq(pdev, 1);
+	if ((mtherm->irq_tjalarm1 < 0) || (mtherm->irq_tjalarm2 < 0)) {
+		dev_err(&pdev->dev, "Alarm irq number not available\n");
+		return -EINVAL;
+	}
+
+	pdev->dev.of_node = pdev->dev.parent->of_node;
+
+	mtherm->dev = &pdev->dev;
+	mtherm->rmap = dev_get_regmap(pdev->dev.parent, NULL);
+	if (!mtherm->rmap) {
+		dev_err(&pdev->dev, "Failed to get parent regmap\n");
+		return -ENODEV;
+	}
+
+	mtherm->tz_device = devm_thermal_zone_of_sensor_register(&pdev->dev, 0,
+				mtherm, &max77620_thermal_ops);
+	if (IS_ERR(mtherm->tz_device)) {
+		ret = PTR_ERR(mtherm->tz_device);
+		dev_err(&pdev->dev, "Failed to register thermal zone: %d\n",
+			ret);
+		return ret;
+	}
+
+	ret = devm_request_threaded_irq(&pdev->dev, mtherm->irq_tjalarm1, NULL,
+					max77620_thermal_irq,
+					IRQF_ONESHOT | IRQF_SHARED,
+					dev_name(&pdev->dev), mtherm);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to request irq1: %d\n", ret);
+		return ret;
+	}
+
+	ret = devm_request_threaded_irq(&pdev->dev, mtherm->irq_tjalarm2, NULL,
+					max77620_thermal_irq,
+					IRQF_ONESHOT | IRQF_SHARED,
+					dev_name(&pdev->dev), mtherm);
+	if (ret < 0) {
+		dev_err(&pdev->dev, "Failed to request irq2: %d\n", ret);
+		return ret;
+	}
+
+	platform_set_drvdata(pdev, mtherm);
+
+	return 0;
+}
+
+static struct platform_device_id max77620_thermal_devtype[] = {
+	{ .name = "max77620-thermal", },
+	{},
+};
+
+static struct platform_driver max77620_thermal_driver = {
+	.driver = {
+		.name = "max77620-thermal",
+	},
+	.probe = max77620_thermal_probe,
+	.id_table = max77620_thermal_devtype,
+};
+
+module_platform_driver(max77620_thermal_driver);
+
+MODULE_DESCRIPTION("Max77620 Junction temperature Thermal driver");
+MODULE_AUTHOR("Laxman Dewangan <ldewangan@nvidia.com>");
+MODULE_AUTHOR("Mallikarjun Kasoju <mkasoju@nvidia.com>");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/mtk_thermal.c b/drivers/thermal/mtk_thermal.c
index 262ab0a2266f..34169c32d495 100644
--- a/drivers/thermal/mtk_thermal.c
+++ b/drivers/thermal/mtk_thermal.c
@@ -2,6 +2,7 @@
  * Copyright (c) 2015 MediaTek Inc.
  * Author: Hanyi Wu <hanyi.wu@mediatek.com>
  *         Sascha Hauer <s.hauer@pengutronix.de>
+ *         Dawei Chien <dawei.chien@mediatek.com>
  *
  * This program is free software; you can redistribute it and/or modify
  * it under the terms of the GNU General Public License version 2 as
@@ -21,6 +22,7 @@
 #include <linux/nvmem-consumer.h>
 #include <linux/of.h>
 #include <linux/of_address.h>
+#include <linux/of_device.h>
 #include <linux/platform_device.h>
 #include <linux/slab.h>
 #include <linux/io.h>
@@ -88,6 +90,7 @@
 #define TEMP_ADCVALIDMASK_VALID_HIGH		BIT(5)
 #define TEMP_ADCVALIDMASK_VALID_POS(bit)	(bit)
 
+/* MT8173 thermal sensors */
 #define MT8173_TS1	0
 #define MT8173_TS2	1
 #define MT8173_TS3	2
@@ -106,7 +109,12 @@
 /* The number of sensing points per bank */
 #define MT8173_NUM_SENSORS_PER_ZONE	4
 
-/* Layout of the fuses providing the calibration data */
+/*
+ * Layout of the fuses providing the calibration data
+ * These macros could be used for both MT8173 and MT2701.
+ * MT8173 has five sensors and need five VTS calibration data,
+ * and MT2701 has three sensors and need three VTS calibration data.
+ */
 #define MT8173_CALIB_BUF0_VALID		BIT(0)
 #define MT8173_CALIB_BUF1_ADC_GE(x)	(((x) >> 22) & 0x3ff)
 #define MT8173_CALIB_BUF0_VTS_TS1(x)	(((x) >> 17) & 0x1ff)
@@ -117,24 +125,50 @@
 #define MT8173_CALIB_BUF0_DEGC_CALI(x)	(((x) >> 1) & 0x3f)
 #define MT8173_CALIB_BUF0_O_SLOPE(x)	(((x) >> 26) & 0x3f)
 
+/* MT2701 thermal sensors */
+#define MT2701_TS1	0
+#define MT2701_TS2	1
+#define MT2701_TSABB	2
+
+/* AUXADC channel 11 is used for the temperature sensors */
+#define MT2701_TEMP_AUXADC_CHANNEL	11
+
+/* The total number of temperature sensors in the MT2701 */
+#define MT2701_NUM_SENSORS	3
+
 #define THERMAL_NAME    "mtk-thermal"
 
+/* The number of sensing points per bank */
+#define MT2701_NUM_SENSORS_PER_ZONE	3
+
 struct mtk_thermal;
 
+struct thermal_bank_cfg {
+	unsigned int num_sensors;
+	const int *sensors;
+};
+
 struct mtk_thermal_bank {
 	struct mtk_thermal *mt;
 	int id;
 };
 
+struct mtk_thermal_data {
+	s32 num_banks;
+	s32 num_sensors;
+	s32 auxadc_channel;
+	const int *sensor_mux_values;
+	const int *msr;
+	const int *adcpnp;
+	struct thermal_bank_cfg bank_data[];
+};
+
 struct mtk_thermal {
 	struct device *dev;
 	void __iomem *thermal_base;
 
 	struct clk *clk_peri_therm;
 	struct clk *clk_auxadc;
-
-	struct mtk_thermal_bank banks[MT8173_NUM_ZONES];
-
 	/* lock: for getting and putting banks */
 	struct mutex lock;
 
@@ -144,16 +178,44 @@ struct mtk_thermal {
 	s32 o_slope;
 	s32 vts[MT8173_NUM_SENSORS];
 
+	const struct mtk_thermal_data *conf;
+	struct mtk_thermal_bank banks[];
 };
 
-struct mtk_thermal_bank_cfg {
-	unsigned int num_sensors;
-	unsigned int sensors[MT8173_NUM_SENSORS_PER_ZONE];
+/* MT8173 thermal sensor data */
+const int mt8173_bank_data[MT8173_NUM_ZONES][3] = {
+	{ MT8173_TS2, MT8173_TS3 },
+	{ MT8173_TS2, MT8173_TS4 },
+	{ MT8173_TS1, MT8173_TS2, MT8173_TSABB },
+	{ MT8173_TS2 },
 };
 
-static const int sensor_mux_values[MT8173_NUM_SENSORS] = { 0, 1, 2, 3, 16 };
+const int mt8173_msr[MT8173_NUM_SENSORS_PER_ZONE] = {
+	TEMP_MSR0, TEMP_MSR1, TEMP_MSR2, TEMP_MSR2
+};
 
-/*
+const int mt8173_adcpnp[MT8173_NUM_SENSORS_PER_ZONE] = {
+	TEMP_ADCPNP0, TEMP_ADCPNP1, TEMP_ADCPNP2, TEMP_ADCPNP3
+};
+
+const int mt8173_mux_values[MT8173_NUM_SENSORS] = { 0, 1, 2, 3, 16 };
+
+/* MT2701 thermal sensor data */
+const int mt2701_bank_data[MT2701_NUM_SENSORS] = {
+	MT2701_TS1, MT2701_TS2, MT2701_TSABB
+};
+
+const int mt2701_msr[MT2701_NUM_SENSORS_PER_ZONE] = {
+	TEMP_MSR0, TEMP_MSR1, TEMP_MSR2
+};
+
+const int mt2701_adcpnp[MT2701_NUM_SENSORS_PER_ZONE] = {
+	TEMP_ADCPNP0, TEMP_ADCPNP1, TEMP_ADCPNP2
+};
+
+const int mt2701_mux_values[MT2701_NUM_SENSORS] = { 0, 1, 16 };
+
+/**
  * The MT8173 thermal controller has four banks. Each bank can read up to
  * four temperature sensors simultaneously. The MT8173 has a total of 5
  * temperature sensors. We use each bank to measure a certain area of the
@@ -166,42 +228,53 @@ static const int sensor_mux_values[MT8173_NUM_SENSORS] = { 0, 1, 2, 3, 16 };
  * data, and this indeed needs the temperatures of the individual banks
  * for making better decisions.
  */
-static const struct mtk_thermal_bank_cfg bank_data[] = {
-	{
-		.num_sensors = 2,
-		.sensors = { MT8173_TS2, MT8173_TS3 },
-	}, {
-		.num_sensors = 2,
-		.sensors = { MT8173_TS2, MT8173_TS4 },
-	}, {
-		.num_sensors = 3,
-		.sensors = { MT8173_TS1, MT8173_TS2, MT8173_TSABB },
-	}, {
-		.num_sensors = 1,
-		.sensors = { MT8173_TS2 },
+static const struct mtk_thermal_data mt8173_thermal_data = {
+	.auxadc_channel = MT8173_TEMP_AUXADC_CHANNEL,
+	.num_banks = MT8173_NUM_ZONES,
+	.num_sensors = MT8173_NUM_SENSORS,
+	.bank_data = {
+		{
+			.num_sensors = 2,
+			.sensors = mt8173_bank_data[0],
+		}, {
+			.num_sensors = 2,
+			.sensors = mt8173_bank_data[1],
+		}, {
+			.num_sensors = 3,
+			.sensors = mt8173_bank_data[2],
+		}, {
+			.num_sensors = 1,
+			.sensors = mt8173_bank_data[3],
+		},
 	},
+	.msr = mt8173_msr,
+	.adcpnp = mt8173_adcpnp,
+	.sensor_mux_values = mt8173_mux_values,
 };
 
-struct mtk_thermal_sense_point {
-	int msr;
-	int adcpnp;
-};
-
-static const struct mtk_thermal_sense_point
-		sensing_points[MT8173_NUM_SENSORS_PER_ZONE] = {
-	{
-		.msr = TEMP_MSR0,
-		.adcpnp = TEMP_ADCPNP0,
-	}, {
-		.msr = TEMP_MSR1,
-		.adcpnp = TEMP_ADCPNP1,
-	}, {
-		.msr = TEMP_MSR2,
-		.adcpnp = TEMP_ADCPNP2,
-	}, {
-		.msr = TEMP_MSR3,
-		.adcpnp = TEMP_ADCPNP3,
+/**
+ * The MT2701 thermal controller has one bank, which can read up to
+ * three temperature sensors simultaneously. The MT2701 has a total of 3
+ * temperature sensors.
+ *
+ * The thermal core only gets the maximum temperature of this one bank,
+ * so the bank concept wouldn't be necessary here. However, the SVS (Smart
+ * Voltage Scaling) unit makes its decisions based on the same bank
+ * data.
+ */
+static const struct mtk_thermal_data mt2701_thermal_data = {
+	.auxadc_channel = MT2701_TEMP_AUXADC_CHANNEL,
+	.num_banks = 1,
+	.num_sensors = MT2701_NUM_SENSORS,
+	.bank_data = {
+		{
+			.num_sensors = 3,
+			.sensors = mt2701_bank_data,
+		},
 	},
+	.msr = mt2701_msr,
+	.adcpnp = mt2701_adcpnp,
+	.sensor_mux_values = mt2701_mux_values,
 };
 
 /**
@@ -270,13 +343,16 @@ static void mtk_thermal_put_bank(struct mtk_thermal_bank *bank)
 static int mtk_thermal_bank_temperature(struct mtk_thermal_bank *bank)
 {
 	struct mtk_thermal *mt = bank->mt;
+	const struct mtk_thermal_data *conf = mt->conf;
 	int i, temp = INT_MIN, max = INT_MIN;
 	u32 raw;
 
-	for (i = 0; i < bank_data[bank->id].num_sensors; i++) {
-		raw = readl(mt->thermal_base + sensing_points[i].msr);
+	for (i = 0; i < conf->bank_data[bank->id].num_sensors; i++) {
+		raw = readl(mt->thermal_base + conf->msr[i]);
 
-		temp = raw_to_mcelsius(mt, bank_data[bank->id].sensors[i], raw);
+		temp = raw_to_mcelsius(mt,
+				       conf->bank_data[bank->id].sensors[i],
+				       raw);
 
 		/*
 		 * The first read of a sensor often contains very high bogus
@@ -299,7 +375,7 @@ static int mtk_read_temp(void *data, int *temperature)
 	int i;
 	int tempmax = INT_MIN;
 
-	for (i = 0; i < MT8173_NUM_ZONES; i++) {
+	for (i = 0; i < mt->conf->num_banks; i++) {
 		struct mtk_thermal_bank *bank = &mt->banks[i];
 
 		mtk_thermal_get_bank(bank);
@@ -322,7 +398,7 @@ static void mtk_thermal_init_bank(struct mtk_thermal *mt, int num,
 				  u32 apmixed_phys_base, u32 auxadc_phys_base)
 {
 	struct mtk_thermal_bank *bank = &mt->banks[num];
-	const struct mtk_thermal_bank_cfg *cfg = &bank_data[num];
+	const struct mtk_thermal_data *conf = mt->conf;
 	int i;
 
 	bank->id = num;
@@ -368,7 +444,7 @@ static void mtk_thermal_init_bank(struct mtk_thermal *mt, int num,
 	 * this value will be stored to TEMP_PNPMUXADDR (TEMP_SPARE0)
 	 * automatically by hw
 	 */
-	writel(BIT(MT8173_TEMP_AUXADC_CHANNEL), mt->thermal_base + TEMP_ADCMUX);
+	writel(BIT(conf->auxadc_channel), mt->thermal_base + TEMP_ADCMUX);
 
 	/* AHB address for auxadc mux selection */
 	writel(auxadc_phys_base + AUXADC_CON1_CLR_V,
@@ -379,18 +455,18 @@ static void mtk_thermal_init_bank(struct mtk_thermal *mt, int num,
 	       mt->thermal_base + TEMP_PNPMUXADDR);
 
 	/* AHB value for auxadc enable */
-	writel(BIT(MT8173_TEMP_AUXADC_CHANNEL), mt->thermal_base + TEMP_ADCEN);
+	writel(BIT(conf->auxadc_channel), mt->thermal_base + TEMP_ADCEN);
 
 	/* AHB address for auxadc enable (channel 0 immediate mode selected) */
 	writel(auxadc_phys_base + AUXADC_CON1_SET_V,
 	       mt->thermal_base + TEMP_ADCENADDR);
 
 	/* AHB address for auxadc valid bit */
-	writel(auxadc_phys_base + AUXADC_DATA(MT8173_TEMP_AUXADC_CHANNEL),
+	writel(auxadc_phys_base + AUXADC_DATA(conf->auxadc_channel),
 	       mt->thermal_base + TEMP_ADCVALIDADDR);
 
 	/* AHB address for auxadc voltage output */
-	writel(auxadc_phys_base + AUXADC_DATA(MT8173_TEMP_AUXADC_CHANNEL),
+	writel(auxadc_phys_base + AUXADC_DATA(conf->auxadc_channel),
 	       mt->thermal_base + TEMP_ADCVOLTADDR);
 
 	/* read valid & voltage are at the same register */
@@ -407,11 +483,12 @@ static void mtk_thermal_init_bank(struct mtk_thermal *mt, int num,
 	writel(TEMP_ADCWRITECTRL_ADC_MUX_WRITE,
 	       mt->thermal_base + TEMP_ADCWRITECTRL);
 
-	for (i = 0; i < cfg->num_sensors; i++)
-		writel(sensor_mux_values[cfg->sensors[i]],
-		       mt->thermal_base + sensing_points[i].adcpnp);
+	for (i = 0; i < conf->bank_data[num].num_sensors; i++)
+		writel(conf->sensor_mux_values[conf->bank_data[num].sensors[i]],
+		       mt->thermal_base + conf->adcpnp[i]);
 
-	writel((1 << cfg->num_sensors) - 1, mt->thermal_base + TEMP_MONCTL0);
+	writel((1 << conf->bank_data[num].num_sensors) - 1,
+	       mt->thermal_base + TEMP_MONCTL0);
 
 	writel(TEMP_ADCWRITECTRL_ADC_PNP_WRITE |
 	       TEMP_ADCWRITECTRL_ADC_MUX_WRITE,
@@ -442,7 +519,7 @@ static int mtk_thermal_get_calibration_data(struct device *dev,
 
 	/* Start with default values */
 	mt->adc_ge = 512;
-	for (i = 0; i < MT8173_NUM_SENSORS; i++)
+	for (i = 0; i < mt->conf->num_sensors; i++)
 		mt->vts[i] = 260;
 	mt->degc_cali = 40;
 	mt->o_slope = 0;
@@ -486,18 +563,37 @@ out:
 	return ret;
 }
 
+static const struct of_device_id mtk_thermal_of_match[] = {
+	{
+		.compatible = "mediatek,mt8173-thermal",
+		.data = (void *)&mt8173_thermal_data,
+	},
+	{
+		.compatible = "mediatek,mt2701-thermal",
+		.data = (void *)&mt2701_thermal_data,
+	}, {
+	},
+};
+MODULE_DEVICE_TABLE(of, mtk_thermal_of_match);
+
 static int mtk_thermal_probe(struct platform_device *pdev)
 {
 	int ret, i;
 	struct device_node *auxadc, *apmixedsys, *np = pdev->dev.of_node;
 	struct mtk_thermal *mt;
 	struct resource *res;
+	const struct of_device_id *of_id;
 	u64 auxadc_phys_base, apmixed_phys_base;
+	struct thermal_zone_device *tzdev;
 
 	mt = devm_kzalloc(&pdev->dev, sizeof(*mt), GFP_KERNEL);
 	if (!mt)
 		return -ENOMEM;
 
+	of_id = of_match_device(mtk_thermal_of_match, &pdev->dev);
+	if (of_id)
+		mt->conf = (const struct mtk_thermal_data *)of_id->data;
+
 	mt->clk_peri_therm = devm_clk_get(&pdev->dev, "therm");
 	if (IS_ERR(mt->clk_peri_therm))
 		return PTR_ERR(mt->clk_peri_therm);
@@ -565,17 +661,23 @@ static int mtk_thermal_probe(struct platform_device *pdev)
 		goto err_disable_clk_auxadc;
 	}
 
-	for (i = 0; i < MT8173_NUM_ZONES; i++)
+	for (i = 0; i < mt->conf->num_banks; i++)
 		mtk_thermal_init_bank(mt, i, apmixed_phys_base,
 				      auxadc_phys_base);
 
 	platform_set_drvdata(pdev, mt);
 
-	devm_thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
-					     &mtk_thermal_ops);
+	tzdev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, mt,
+						     &mtk_thermal_ops);
+	if (IS_ERR(tzdev)) {
+		ret = PTR_ERR(tzdev);
+		goto err_disable_clk_peri_therm;
+	}
 
 	return 0;
 
+err_disable_clk_peri_therm:
+	clk_disable_unprepare(mt->clk_peri_therm);
 err_disable_clk_auxadc:
 	clk_disable_unprepare(mt->clk_auxadc);
 
@@ -592,13 +694,6 @@ static int mtk_thermal_remove(struct platform_device *pdev)
 	return 0;
 }
 
-static const struct of_device_id mtk_thermal_of_match[] = {
-	{
-		.compatible = "mediatek,mt8173-thermal",
-	}, {
-	},
-};
-
 static struct platform_driver mtk_thermal_driver = {
 	.probe = mtk_thermal_probe,
 	.remove = mtk_thermal_remove,
@@ -610,6 +705,7 @@ static struct platform_driver mtk_thermal_driver = {
 
 module_platform_driver(mtk_thermal_driver);
 
+MODULE_AUTHOR("Dawei Chien <dawei.chien@mediatek.com>");
 MODULE_AUTHOR("Sascha Hauer <s.hauer@pengutronix.de>");
 MODULE_AUTHOR("Hanyi Wu <hanyi.wu@mediatek.com>");
 MODULE_DESCRIPTION("Mediatek thermal driver");
diff --git a/drivers/thermal/of-thermal.c b/drivers/thermal/of-thermal.c
index b8e509c60848..d04ec3b9e5ff 100644
--- a/drivers/thermal/of-thermal.c
+++ b/drivers/thermal/of-thermal.c
@@ -101,6 +101,17 @@ static int of_thermal_get_temp(struct thermal_zone_device *tz,
 	return data->ops->get_temp(data->sensor_data, temp);
 }
 
+static int of_thermal_set_trips(struct thermal_zone_device *tz,
+				int low, int high)
+{
+	struct __thermal_zone *data = tz->devdata;
+
+	if (!data->ops || !data->ops->set_trips)
+		return -EINVAL;
+
+	return data->ops->set_trips(data->sensor_data, low, high);
+}
+
 /**
  * of_thermal_get_ntrips - function to export number of available trip
  *			   points.
@@ -181,9 +192,6 @@ static int of_thermal_set_emul_temp(struct thermal_zone_device *tz,
 {
 	struct __thermal_zone *data = tz->devdata;
 
-	if (!data->ops || !data->ops->set_emul_temp)
-		return -EINVAL;
-
 	return data->ops->set_emul_temp(data->sensor_data, temp);
 }
 
@@ -191,25 +199,11 @@ static int of_thermal_get_trend(struct thermal_zone_device *tz, int trip,
 				enum thermal_trend *trend)
 {
 	struct __thermal_zone *data = tz->devdata;
-	long dev_trend;
-	int r;
 
 	if (!data->ops->get_trend)
 		return -EINVAL;
 
-	r = data->ops->get_trend(data->sensor_data, &dev_trend);
-	if (r)
-		return r;
-
-	/* TODO: These intervals might have some thresholds, but in core code */
-	if (dev_trend > 0)
-		*trend = THERMAL_TREND_RAISING;
-	else if (dev_trend < 0)
-		*trend = THERMAL_TREND_DROPPING;
-	else
-		*trend = THERMAL_TREND_STABLE;
-
-	return 0;
+	return data->ops->get_trend(data->sensor_data, trip, trend);
 }
 
 static int of_thermal_bind(struct thermal_zone_device *thermal,
@@ -292,7 +286,7 @@ static int of_thermal_set_mode(struct thermal_zone_device *tz,
 	mutex_unlock(&tz->lock);
 
 	data->mode = mode;
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return 0;
 }
@@ -427,7 +421,17 @@ thermal_zone_of_add_sensor(struct device_node *zone,
 
 	tzd->ops->get_temp = of_thermal_get_temp;
 	tzd->ops->get_trend = of_thermal_get_trend;
-	tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
+
+	/*
+	 * The thermal zone core will calculate the window if they have set the
+	 * optional set_trips pointer.
+	 */
+	if (ops->set_trips)
+		tzd->ops->set_trips = of_thermal_set_trips;
+
+	if (ops->set_emul_temp)
+		tzd->ops->set_emul_temp = of_thermal_set_emul_temp;
+
 	mutex_unlock(&tzd->lock);
 
 	return tzd;
@@ -596,7 +600,7 @@ static int devm_thermal_zone_of_sensor_match(struct device *dev, void *res,
  * Return: On success returns a valid struct thermal_zone_device,
  * otherwise, it returns a corresponding ERR_PTR(). Caller must
  * check the return value with help of IS_ERR() helper.
- * Registered hermal_zone_device device will automatically be
+ * Registered thermal_zone_device device will automatically be
  * released when device is unbounded.
  */
 struct thermal_zone_device *devm_thermal_zone_of_sensor_register(
diff --git a/drivers/thermal/qcom-spmi-temp-alarm.c b/drivers/thermal/qcom-spmi-temp-alarm.c
index f8a3c60bef94..819c6d5d7aa7 100644
--- a/drivers/thermal/qcom-spmi-temp-alarm.c
+++ b/drivers/thermal/qcom-spmi-temp-alarm.c
@@ -150,7 +150,7 @@ static irqreturn_t qpnp_tm_isr(int irq, void *data)
 {
 	struct qpnp_tm_chip *chip = data;
 
-	thermal_zone_device_update(chip->tz_dev);
+	thermal_zone_device_update(chip->tz_dev, THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/qcom/Kconfig b/drivers/thermal/qcom/Kconfig
new file mode 100644
index 000000000000..be32e5abce3c
--- /dev/null
+++ b/drivers/thermal/qcom/Kconfig
@@ -0,0 +1,11 @@
+config QCOM_TSENS
+	tristate "Qualcomm TSENS Temperature Alarm"
+	depends on THERMAL
+	depends on QCOM_QFPROM
+	depends on ARCH_QCOM || COMPILE_TEST
+	help
+	  This enables the thermal sysfs driver for the TSENS device. It shows
+	  up in Sysfs as a thermal zone with multiple trip points. Disabling the
+	  thermal zone device via the mode file results in disabling the sensor.
+	  Also able to set threshold temperature for both hot and cold and update
+	  when a threshold is reached.
diff --git a/drivers/thermal/qcom/Makefile b/drivers/thermal/qcom/Makefile
new file mode 100644
index 000000000000..2cc2193637e7
--- /dev/null
+++ b/drivers/thermal/qcom/Makefile
@@ -0,0 +1,2 @@
+obj-$(CONFIG_QCOM_TSENS)	+= qcom_tsens.o
+qcom_tsens-y			+= tsens.o tsens-common.o tsens-8916.o tsens-8974.o tsens-8960.o tsens-8996.o
diff --git a/drivers/thermal/qcom/tsens-8916.c b/drivers/thermal/qcom/tsens-8916.c
new file mode 100644
index 000000000000..fdf561b8b81d
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8916.c
@@ -0,0 +1,113 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include "tsens.h"
+
+/* eeprom layout data for 8916 */
+#define BASE0_MASK	0x0000007f
+#define BASE1_MASK	0xfe000000
+#define BASE0_SHIFT	0
+#define BASE1_SHIFT	25
+
+#define S0_P1_MASK	0x00000f80
+#define S1_P1_MASK	0x003e0000
+#define S2_P1_MASK	0xf8000000
+#define S3_P1_MASK	0x000003e0
+#define S4_P1_MASK	0x000f8000
+
+#define S0_P2_MASK	0x0001f000
+#define S1_P2_MASK	0x07c00000
+#define S2_P2_MASK	0x0000001f
+#define S3_P2_MASK	0x00007c00
+#define S4_P2_MASK	0x01f00000
+
+#define S0_P1_SHIFT	7
+#define S1_P1_SHIFT	17
+#define S2_P1_SHIFT	27
+#define S3_P1_SHIFT	5
+#define S4_P1_SHIFT	15
+
+#define S0_P2_SHIFT	12
+#define S1_P2_SHIFT	22
+#define S2_P2_SHIFT	0
+#define S3_P2_SHIFT	10
+#define S4_P2_SHIFT	20
+
+#define CAL_SEL_MASK	0xe0000000
+#define CAL_SEL_SHIFT	29
+
+static int calibrate_8916(struct tsens_device *tmdev)
+{
+	int base0 = 0, base1 = 0, i;
+	u32 p1[5], p2[5];
+	int mode = 0;
+	u32 *qfprom_cdata, *qfprom_csel;
+
+	qfprom_cdata = (u32 *)qfprom_read(tmdev->dev, "calib");
+	if (IS_ERR(qfprom_cdata))
+		return PTR_ERR(qfprom_cdata);
+
+	qfprom_csel = (u32 *)qfprom_read(tmdev->dev, "calib_sel");
+	if (IS_ERR(qfprom_csel))
+		return PTR_ERR(qfprom_csel);
+
+	mode = (qfprom_csel[0] & CAL_SEL_MASK) >> CAL_SEL_SHIFT;
+	dev_dbg(tmdev->dev, "calibration mode is %d\n", mode);
+
+	switch (mode) {
+	case TWO_PT_CALIB:
+		base1 = (qfprom_cdata[1] & BASE1_MASK) >> BASE1_SHIFT;
+		p2[0] = (qfprom_cdata[0] & S0_P2_MASK) >> S0_P2_SHIFT;
+		p2[1] = (qfprom_cdata[0] & S1_P2_MASK) >> S1_P2_SHIFT;
+		p2[2] = (qfprom_cdata[1] & S2_P2_MASK) >> S2_P2_SHIFT;
+		p2[3] = (qfprom_cdata[1] & S3_P2_MASK) >> S3_P2_SHIFT;
+		p2[4] = (qfprom_cdata[1] & S4_P2_MASK) >> S4_P2_SHIFT;
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p2[i] = ((base1 + p2[i]) << 3);
+		/* Fall through */
+	case ONE_PT_CALIB2:
+		base0 = (qfprom_cdata[0] & BASE0_MASK);
+		p1[0] = (qfprom_cdata[0] & S0_P1_MASK) >> S0_P1_SHIFT;
+		p1[1] = (qfprom_cdata[0] & S1_P1_MASK) >> S1_P1_SHIFT;
+		p1[2] = (qfprom_cdata[0] & S2_P1_MASK) >> S2_P1_SHIFT;
+		p1[3] = (qfprom_cdata[1] & S3_P1_MASK) >> S3_P1_SHIFT;
+		p1[4] = (qfprom_cdata[1] & S4_P1_MASK) >> S4_P1_SHIFT;
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p1[i] = (((base0) + p1[i]) << 3);
+		break;
+	default:
+		for (i = 0; i < tmdev->num_sensors; i++) {
+			p1[i] = 500;
+			p2[i] = 780;
+		}
+		break;
+	}
+
+	compute_intercept_slope(tmdev, p1, p2, mode);
+
+	return 0;
+}
+
+static const struct tsens_ops ops_8916 = {
+	.init		= init_common,
+	.calibrate	= calibrate_8916,
+	.get_temp	= get_temp_common,
+};
+
+const struct tsens_data data_8916 = {
+	.num_sensors	= 5,
+	.ops		= &ops_8916,
+	.hw_ids		= (unsigned int []){0, 1, 2, 4, 5 },
+};
diff --git a/drivers/thermal/qcom/tsens-8960.c b/drivers/thermal/qcom/tsens-8960.c
new file mode 100644
index 000000000000..0451277d3a8f
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8960.c
@@ -0,0 +1,292 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/regmap.h>
+#include <linux/thermal.h>
+#include "tsens.h"
+
+#define CAL_MDEGC		30000
+
+#define CONFIG_ADDR		0x3640
+#define CONFIG_ADDR_8660	0x3620
+/* CONFIG_ADDR bitmasks */
+#define CONFIG			0x9b
+#define CONFIG_MASK		0xf
+#define CONFIG_8660		1
+#define CONFIG_SHIFT_8660	28
+#define CONFIG_MASK_8660	(3 << CONFIG_SHIFT_8660)
+
+#define STATUS_CNTL_ADDR_8064	0x3660
+#define CNTL_ADDR		0x3620
+/* CNTL_ADDR bitmasks */
+#define EN			BIT(0)
+#define SW_RST			BIT(1)
+#define SENSOR0_EN		BIT(3)
+#define SLP_CLK_ENA		BIT(26)
+#define SLP_CLK_ENA_8660	BIT(24)
+#define MEASURE_PERIOD		1
+#define SENSOR0_SHIFT		3
+
+/* INT_STATUS_ADDR bitmasks */
+#define MIN_STATUS_MASK		BIT(0)
+#define LOWER_STATUS_CLR	BIT(1)
+#define UPPER_STATUS_CLR	BIT(2)
+#define MAX_STATUS_MASK		BIT(3)
+
+#define THRESHOLD_ADDR		0x3624
+/* THRESHOLD_ADDR bitmasks */
+#define THRESHOLD_MAX_LIMIT_SHIFT	24
+#define THRESHOLD_MIN_LIMIT_SHIFT	16
+#define THRESHOLD_UPPER_LIMIT_SHIFT	8
+#define THRESHOLD_LOWER_LIMIT_SHIFT	0
+
+/* Initial temperature threshold values */
+#define LOWER_LIMIT_TH		0x50
+#define UPPER_LIMIT_TH		0xdf
+#define MIN_LIMIT_TH		0x0
+#define MAX_LIMIT_TH		0xff
+
+#define S0_STATUS_ADDR		0x3628
+#define INT_STATUS_ADDR		0x363c
+#define TRDY_MASK		BIT(7)
+#define TIMEOUT_US		100
+
+static int suspend_8960(struct tsens_device *tmdev)
+{
+	int ret;
+	unsigned int mask;
+	struct regmap *map = tmdev->map;
+
+	ret = regmap_read(map, THRESHOLD_ADDR, &tmdev->ctx.threshold);
+	if (ret)
+		return ret;
+
+	ret = regmap_read(map, CNTL_ADDR, &tmdev->ctx.control);
+	if (ret)
+		return ret;
+
+	if (tmdev->num_sensors > 1)
+		mask = SLP_CLK_ENA | EN;
+	else
+		mask = SLP_CLK_ENA_8660 | EN;
+
+	ret = regmap_update_bits(map, CNTL_ADDR, mask, 0);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int resume_8960(struct tsens_device *tmdev)
+{
+	int ret;
+	struct regmap *map = tmdev->map;
+
+	ret = regmap_update_bits(map, CNTL_ADDR, SW_RST, SW_RST);
+	if (ret)
+		return ret;
+
+	/*
+	 * Separate CONFIG restore is not needed only for 8660 as
+	 * config is part of CTRL Addr and its restored as such
+	 */
+	if (tmdev->num_sensors > 1) {
+		ret = regmap_update_bits(map, CONFIG_ADDR, CONFIG_MASK, CONFIG);
+		if (ret)
+			return ret;
+	}
+
+	ret = regmap_write(map, THRESHOLD_ADDR, tmdev->ctx.threshold);
+	if (ret)
+		return ret;
+
+	ret = regmap_write(map, CNTL_ADDR, tmdev->ctx.control);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int enable_8960(struct tsens_device *tmdev, int id)
+{
+	int ret;
+	u32 reg, mask;
+
+	ret = regmap_read(tmdev->map, CNTL_ADDR, &reg);
+	if (ret)
+		return ret;
+
+	mask = BIT(id + SENSOR0_SHIFT);
+	ret = regmap_write(tmdev->map, CNTL_ADDR, reg | SW_RST);
+	if (ret)
+		return ret;
+
+	if (tmdev->num_sensors > 1)
+		reg |= mask | SLP_CLK_ENA | EN;
+	else
+		reg |= mask | SLP_CLK_ENA_8660 | EN;
+
+	ret = regmap_write(tmdev->map, CNTL_ADDR, reg);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static void disable_8960(struct tsens_device *tmdev)
+{
+	int ret;
+	u32 reg_cntl;
+	u32 mask;
+
+	mask = GENMASK(tmdev->num_sensors - 1, 0);
+	mask <<= SENSOR0_SHIFT;
+	mask |= EN;
+
+	ret = regmap_read(tmdev->map, CNTL_ADDR, &reg_cntl);
+	if (ret)
+		return;
+
+	reg_cntl &= ~mask;
+
+	if (tmdev->num_sensors > 1)
+		reg_cntl &= ~SLP_CLK_ENA;
+	else
+		reg_cntl &= ~SLP_CLK_ENA_8660;
+
+	regmap_write(tmdev->map, CNTL_ADDR, reg_cntl);
+}
+
+static int init_8960(struct tsens_device *tmdev)
+{
+	int ret, i;
+	u32 reg_cntl;
+
+	tmdev->map = dev_get_regmap(tmdev->dev, NULL);
+	if (!tmdev->map)
+		return -ENODEV;
+
+	/*
+	 * The status registers for each sensor are discontiguous
+	 * because some SoCs have 5 sensors while others have more
+	 * but the control registers stay in the same place, i.e
+	 * directly after the first 5 status registers.
+	 */
+	for (i = 0; i < tmdev->num_sensors; i++) {
+		if (i >= 5)
+			tmdev->sensor[i].status = S0_STATUS_ADDR + 40;
+		tmdev->sensor[i].status += i * 4;
+	}
+
+	reg_cntl = SW_RST;
+	ret = regmap_update_bits(tmdev->map, CNTL_ADDR, SW_RST, reg_cntl);
+	if (ret)
+		return ret;
+
+	if (tmdev->num_sensors > 1) {
+		reg_cntl |= SLP_CLK_ENA | (MEASURE_PERIOD << 18);
+		reg_cntl &= ~SW_RST;
+		ret = regmap_update_bits(tmdev->map, CONFIG_ADDR,
+					 CONFIG_MASK, CONFIG);
+	} else {
+		reg_cntl |= SLP_CLK_ENA_8660 | (MEASURE_PERIOD << 16);
+		reg_cntl &= ~CONFIG_MASK_8660;
+		reg_cntl |= CONFIG_8660 << CONFIG_SHIFT_8660;
+	}
+
+	reg_cntl |= GENMASK(tmdev->num_sensors - 1, 0) << SENSOR0_SHIFT;
+	ret = regmap_write(tmdev->map, CNTL_ADDR, reg_cntl);
+	if (ret)
+		return ret;
+
+	reg_cntl |= EN;
+	ret = regmap_write(tmdev->map, CNTL_ADDR, reg_cntl);
+	if (ret)
+		return ret;
+
+	return 0;
+}
+
+static int calibrate_8960(struct tsens_device *tmdev)
+{
+	int i;
+	char *data;
+
+	ssize_t num_read = tmdev->num_sensors;
+	struct tsens_sensor *s = tmdev->sensor;
+
+	data = qfprom_read(tmdev->dev, "calib");
+	if (IS_ERR(data))
+		data = qfprom_read(tmdev->dev, "calib_backup");
+	if (IS_ERR(data))
+		return PTR_ERR(data);
+
+	for (i = 0; i < num_read; i++, s++)
+		s->offset = data[i];
+
+	return 0;
+}
+
+/* Temperature on y axis and ADC-code on x-axis */
+static inline int code_to_mdegC(u32 adc_code, const struct tsens_sensor *s)
+{
+	int slope, offset;
+
+	slope = thermal_zone_get_slope(s->tzd);
+	offset = CAL_MDEGC - slope * s->offset;
+
+	return adc_code * slope + offset;
+}
+
+static int get_temp_8960(struct tsens_device *tmdev, int id, int *temp)
+{
+	int ret;
+	u32 code, trdy;
+	const struct tsens_sensor *s = &tmdev->sensor[id];
+	unsigned long timeout;
+
+	timeout = jiffies + usecs_to_jiffies(TIMEOUT_US);
+	do {
+		ret = regmap_read(tmdev->map, INT_STATUS_ADDR, &trdy);
+		if (ret)
+			return ret;
+		if (!(trdy & TRDY_MASK))
+			continue;
+		ret = regmap_read(tmdev->map, s->status, &code);
+		if (ret)
+			return ret;
+		*temp = code_to_mdegC(code, s);
+		return 0;
+	} while (time_before(jiffies, timeout));
+
+	return -ETIMEDOUT;
+}
+
+static const struct tsens_ops ops_8960 = {
+	.init		= init_8960,
+	.calibrate	= calibrate_8960,
+	.get_temp	= get_temp_8960,
+	.enable		= enable_8960,
+	.disable	= disable_8960,
+	.suspend	= suspend_8960,
+	.resume		= resume_8960,
+};
+
+const struct tsens_data data_8960 = {
+	.num_sensors	= 11,
+	.ops		= &ops_8960,
+};
diff --git a/drivers/thermal/qcom/tsens-8974.c b/drivers/thermal/qcom/tsens-8974.c
new file mode 100644
index 000000000000..9baf77e8cbe3
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8974.c
@@ -0,0 +1,244 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include "tsens.h"
+
+/* eeprom layout data for 8974 */
+#define BASE1_MASK		0xff
+#define S0_P1_MASK		0x3f00
+#define S1_P1_MASK		0xfc000
+#define S2_P1_MASK		0x3f00000
+#define S3_P1_MASK		0xfc000000
+#define S4_P1_MASK		0x3f
+#define S5_P1_MASK		0xfc0
+#define S6_P1_MASK		0x3f000
+#define S7_P1_MASK		0xfc0000
+#define S8_P1_MASK		0x3f000000
+#define S8_P1_MASK_BKP		0x3f
+#define S9_P1_MASK		0x3f
+#define S9_P1_MASK_BKP		0xfc0
+#define S10_P1_MASK		0xfc0
+#define S10_P1_MASK_BKP		0x3f000
+#define CAL_SEL_0_1		0xc0000000
+#define CAL_SEL_2		0x40000000
+#define CAL_SEL_SHIFT		30
+#define CAL_SEL_SHIFT_2		28
+
+#define S0_P1_SHIFT		8
+#define S1_P1_SHIFT		14
+#define S2_P1_SHIFT		20
+#define S3_P1_SHIFT		26
+#define S5_P1_SHIFT		6
+#define S6_P1_SHIFT		12
+#define S7_P1_SHIFT		18
+#define S8_P1_SHIFT		24
+#define S9_P1_BKP_SHIFT		6
+#define S10_P1_SHIFT		6
+#define S10_P1_BKP_SHIFT	12
+
+#define BASE2_SHIFT		12
+#define BASE2_BKP_SHIFT		18
+#define S0_P2_SHIFT		20
+#define S0_P2_BKP_SHIFT		26
+#define S1_P2_SHIFT		26
+#define S2_P2_BKP_SHIFT		6
+#define S3_P2_SHIFT		6
+#define S3_P2_BKP_SHIFT		12
+#define S4_P2_SHIFT		12
+#define S4_P2_BKP_SHIFT		18
+#define S5_P2_SHIFT		18
+#define S5_P2_BKP_SHIFT		24
+#define S6_P2_SHIFT		24
+#define S7_P2_BKP_SHIFT		6
+#define S8_P2_SHIFT		6
+#define S8_P2_BKP_SHIFT		12
+#define S9_P2_SHIFT		12
+#define S9_P2_BKP_SHIFT		18
+#define S10_P2_SHIFT		18
+#define S10_P2_BKP_SHIFT	24
+
+#define BASE2_MASK		0xff000
+#define BASE2_BKP_MASK		0xfc0000
+#define S0_P2_MASK		0x3f00000
+#define S0_P2_BKP_MASK		0xfc000000
+#define S1_P2_MASK		0xfc000000
+#define S1_P2_BKP_MASK		0x3f
+#define S2_P2_MASK		0x3f
+#define S2_P2_BKP_MASK		0xfc0
+#define S3_P2_MASK		0xfc0
+#define S3_P2_BKP_MASK		0x3f000
+#define S4_P2_MASK		0x3f000
+#define S4_P2_BKP_MASK		0xfc0000
+#define S5_P2_MASK		0xfc0000
+#define S5_P2_BKP_MASK		0x3f000000
+#define S6_P2_MASK		0x3f000000
+#define S6_P2_BKP_MASK		0x3f
+#define S7_P2_MASK		0x3f
+#define S7_P2_BKP_MASK		0xfc0
+#define S8_P2_MASK		0xfc0
+#define S8_P2_BKP_MASK		0x3f000
+#define S9_P2_MASK		0x3f000
+#define S9_P2_BKP_MASK		0xfc0000
+#define S10_P2_MASK		0xfc0000
+#define S10_P2_BKP_MASK		0x3f000000
+
+#define BKP_SEL			0x3
+#define BKP_REDUN_SEL		0xe0000000
+#define BKP_REDUN_SHIFT		29
+
+#define BIT_APPEND		0x3
+
+static int calibrate_8974(struct tsens_device *tmdev)
+{
+	int base1 = 0, base2 = 0, i;
+	u32 p1[11], p2[11];
+	int mode = 0;
+	u32 *calib, *bkp;
+	u32 calib_redun_sel;
+
+	calib = (u32 *)qfprom_read(tmdev->dev, "calib");
+	if (IS_ERR(calib))
+		return PTR_ERR(calib);
+
+	bkp = (u32 *)qfprom_read(tmdev->dev, "calib_backup");
+	if (IS_ERR(bkp))
+		return PTR_ERR(bkp);
+
+	calib_redun_sel =  bkp[1] & BKP_REDUN_SEL;
+	calib_redun_sel >>= BKP_REDUN_SHIFT;
+
+	if (calib_redun_sel == BKP_SEL) {
+		mode = (calib[4] & CAL_SEL_0_1) >> CAL_SEL_SHIFT;
+		mode |= (calib[5] & CAL_SEL_2) >> CAL_SEL_SHIFT_2;
+
+		switch (mode) {
+		case TWO_PT_CALIB:
+			base2 = (bkp[2] & BASE2_BKP_MASK) >> BASE2_BKP_SHIFT;
+			p2[0] = (bkp[2] & S0_P2_BKP_MASK) >> S0_P2_BKP_SHIFT;
+			p2[1] = (bkp[3] & S1_P2_BKP_MASK);
+			p2[2] = (bkp[3] & S2_P2_BKP_MASK) >> S2_P2_BKP_SHIFT;
+			p2[3] = (bkp[3] & S3_P2_BKP_MASK) >> S3_P2_BKP_SHIFT;
+			p2[4] = (bkp[3] & S4_P2_BKP_MASK) >> S4_P2_BKP_SHIFT;
+			p2[5] = (calib[4] & S5_P2_BKP_MASK) >> S5_P2_BKP_SHIFT;
+			p2[6] = (calib[5] & S6_P2_BKP_MASK);
+			p2[7] = (calib[5] & S7_P2_BKP_MASK) >> S7_P2_BKP_SHIFT;
+			p2[8] = (calib[5] & S8_P2_BKP_MASK) >> S8_P2_BKP_SHIFT;
+			p2[9] = (calib[5] & S9_P2_BKP_MASK) >> S9_P2_BKP_SHIFT;
+			p2[10] = (calib[5] & S10_P2_BKP_MASK) >> S10_P2_BKP_SHIFT;
+			/* Fall through */
+		case ONE_PT_CALIB:
+		case ONE_PT_CALIB2:
+			base1 = bkp[0] & BASE1_MASK;
+			p1[0] = (bkp[0] & S0_P1_MASK) >> S0_P1_SHIFT;
+			p1[1] = (bkp[0] & S1_P1_MASK) >> S1_P1_SHIFT;
+			p1[2] = (bkp[0] & S2_P1_MASK) >> S2_P1_SHIFT;
+			p1[3] = (bkp[0] & S3_P1_MASK) >> S3_P1_SHIFT;
+			p1[4] = (bkp[1] & S4_P1_MASK);
+			p1[5] = (bkp[1] & S5_P1_MASK) >> S5_P1_SHIFT;
+			p1[6] = (bkp[1] & S6_P1_MASK) >> S6_P1_SHIFT;
+			p1[7] = (bkp[1] & S7_P1_MASK) >> S7_P1_SHIFT;
+			p1[8] = (bkp[2] & S8_P1_MASK_BKP) >> S8_P1_SHIFT;
+			p1[9] = (bkp[2] & S9_P1_MASK_BKP) >> S9_P1_BKP_SHIFT;
+			p1[10] = (bkp[2] & S10_P1_MASK_BKP) >> S10_P1_BKP_SHIFT;
+			break;
+		}
+	} else {
+		mode = (calib[1] & CAL_SEL_0_1) >> CAL_SEL_SHIFT;
+		mode |= (calib[3] & CAL_SEL_2) >> CAL_SEL_SHIFT_2;
+
+		switch (mode) {
+		case TWO_PT_CALIB:
+			base2 = (calib[2] & BASE2_MASK) >> BASE2_SHIFT;
+			p2[0] = (calib[2] & S0_P2_MASK) >> S0_P2_SHIFT;
+			p2[1] = (calib[2] & S1_P2_MASK) >> S1_P2_SHIFT;
+			p2[2] = (calib[3] & S2_P2_MASK);
+			p2[3] = (calib[3] & S3_P2_MASK) >> S3_P2_SHIFT;
+			p2[4] = (calib[3] & S4_P2_MASK) >> S4_P2_SHIFT;
+			p2[5] = (calib[3] & S5_P2_MASK) >> S5_P2_SHIFT;
+			p2[6] = (calib[3] & S6_P2_MASK) >> S6_P2_SHIFT;
+			p2[7] = (calib[4] & S7_P2_MASK);
+			p2[8] = (calib[4] & S8_P2_MASK) >> S8_P2_SHIFT;
+			p2[9] = (calib[4] & S9_P2_MASK) >> S9_P2_SHIFT;
+			p2[10] = (calib[4] & S10_P2_MASK) >> S10_P2_SHIFT;
+			/* Fall through */
+		case ONE_PT_CALIB:
+		case ONE_PT_CALIB2:
+			base1 = calib[0] & BASE1_MASK;
+			p1[0] = (calib[0] & S0_P1_MASK) >> S0_P1_SHIFT;
+			p1[1] = (calib[0] & S1_P1_MASK) >> S1_P1_SHIFT;
+			p1[2] = (calib[0] & S2_P1_MASK) >> S2_P1_SHIFT;
+			p1[3] = (calib[0] & S3_P1_MASK) >> S3_P1_SHIFT;
+			p1[4] = (calib[1] & S4_P1_MASK);
+			p1[5] = (calib[1] & S5_P1_MASK) >> S5_P1_SHIFT;
+			p1[6] = (calib[1] & S6_P1_MASK) >> S6_P1_SHIFT;
+			p1[7] = (calib[1] & S7_P1_MASK) >> S7_P1_SHIFT;
+			p1[8] = (calib[1] & S8_P1_MASK) >> S8_P1_SHIFT;
+			p1[9] = (calib[2] & S9_P1_MASK);
+			p1[10] = (calib[2] & S10_P1_MASK) >> S10_P1_SHIFT;
+			break;
+		}
+	}
+
+	switch (mode) {
+	case ONE_PT_CALIB:
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p1[i] += (base1 << 2) | BIT_APPEND;
+		break;
+	case TWO_PT_CALIB:
+		for (i = 0; i < tmdev->num_sensors; i++) {
+			p2[i] += base2;
+			p2[i] <<= 2;
+			p2[i] |= BIT_APPEND;
+		}
+		/* Fall through */
+	case ONE_PT_CALIB2:
+		for (i = 0; i < tmdev->num_sensors; i++) {
+			p1[i] += base1;
+			p1[i] <<= 2;
+			p1[i] |= BIT_APPEND;
+		}
+		break;
+	default:
+		for (i = 0; i < tmdev->num_sensors; i++)
+			p2[i] = 780;
+		p1[0] = 502;
+		p1[1] = 509;
+		p1[2] = 503;
+		p1[3] = 509;
+		p1[4] = 505;
+		p1[5] = 509;
+		p1[6] = 507;
+		p1[7] = 510;
+		p1[8] = 508;
+		p1[9] = 509;
+		p1[10] = 508;
+		break;
+	}
+
+	compute_intercept_slope(tmdev, p1, p2, mode);
+
+	return 0;
+}
+
+static const struct tsens_ops ops_8974 = {
+	.init		= init_common,
+	.calibrate	= calibrate_8974,
+	.get_temp	= get_temp_common,
+};
+
+const struct tsens_data data_8974 = {
+	.num_sensors	= 11,
+	.ops		= &ops_8974,
+};
diff --git a/drivers/thermal/qcom/tsens-8996.c b/drivers/thermal/qcom/tsens-8996.c
new file mode 100644
index 000000000000..e1f77818d8fa
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-8996.c
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include "tsens.h"
+
+#define STATUS_OFFSET	0x10a0
+#define LAST_TEMP_MASK	0xfff
+#define STATUS_VALID_BIT	BIT(21)
+#define CODE_SIGN_BIT		BIT(11)
+
+static int get_temp_8996(struct tsens_device *tmdev, int id, int *temp)
+{
+	struct tsens_sensor *s = &tmdev->sensor[id];
+	u32 code;
+	unsigned int sensor_addr;
+	int last_temp = 0, last_temp2 = 0, last_temp3 = 0, ret;
+
+	sensor_addr = STATUS_OFFSET + s->hw_id * 4;
+	ret = regmap_read(tmdev->map, sensor_addr, &code);
+	if (ret)
+		return ret;
+	last_temp = code & LAST_TEMP_MASK;
+	if (code & STATUS_VALID_BIT)
+		goto done;
+
+	/* Try a second time */
+	ret = regmap_read(tmdev->map, sensor_addr, &code);
+	if (ret)
+		return ret;
+	if (code & STATUS_VALID_BIT) {
+		last_temp = code & LAST_TEMP_MASK;
+		goto done;
+	} else {
+		last_temp2 = code & LAST_TEMP_MASK;
+	}
+
+	/* Try a third/last time */
+	ret = regmap_read(tmdev->map, sensor_addr, &code);
+	if (ret)
+		return ret;
+	if (code & STATUS_VALID_BIT) {
+		last_temp = code & LAST_TEMP_MASK;
+		goto done;
+	} else {
+		last_temp3 = code & LAST_TEMP_MASK;
+	}
+
+	if (last_temp == last_temp2)
+		last_temp = last_temp2;
+	else if (last_temp2 == last_temp3)
+		last_temp = last_temp3;
+done:
+	/* Code sign bit is the sign extension for a negative value */
+	if (last_temp & CODE_SIGN_BIT)
+		last_temp |= ~CODE_SIGN_BIT;
+
+	/* Temperatures are in deciCelicius */
+	*temp = last_temp * 100;
+
+	return 0;
+}
+
+static const struct tsens_ops ops_8996 = {
+	.init		= init_common,
+	.get_temp	= get_temp_8996,
+};
+
+const struct tsens_data data_8996 = {
+	.num_sensors	= 13,
+	.ops		= &ops_8996,
+};
diff --git a/drivers/thermal/qcom/tsens-common.c b/drivers/thermal/qcom/tsens-common.c
new file mode 100644
index 000000000000..b1449ad67fc0
--- /dev/null
+++ b/drivers/thermal/qcom/tsens-common.c
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/nvmem-consumer.h>
+#include <linux/of_address.h>
+#include <linux/platform_device.h>
+#include <linux/regmap.h>
+#include "tsens.h"
+
+#define S0_ST_ADDR		0x1030
+#define SN_ADDR_OFFSET		0x4
+#define SN_ST_TEMP_MASK		0x3ff
+#define CAL_DEGC_PT1		30
+#define CAL_DEGC_PT2		120
+#define SLOPE_FACTOR		1000
+#define SLOPE_DEFAULT		3200
+
+char *qfprom_read(struct device *dev, const char *cname)
+{
+	struct nvmem_cell *cell;
+	ssize_t data;
+	char *ret;
+
+	cell = nvmem_cell_get(dev, cname);
+	if (IS_ERR(cell))
+		return ERR_CAST(cell);
+
+	ret = nvmem_cell_read(cell, &data);
+	nvmem_cell_put(cell);
+
+	return ret;
+}
+
+/*
+ * Use this function on devices where slope and offset calculations
+ * depend on calibration data read from qfprom. On others the slope
+ * and offset values are derived from tz->tzp->slope and tz->tzp->offset
+ * resp.
+ */
+void compute_intercept_slope(struct tsens_device *tmdev, u32 *p1,
+			     u32 *p2, u32 mode)
+{
+	int i;
+	int num, den;
+
+	for (i = 0; i < tmdev->num_sensors; i++) {
+		dev_dbg(tmdev->dev,
+			"sensor%d - data_point1:%#x data_point2:%#x\n",
+			i, p1[i], p2[i]);
+
+		tmdev->sensor[i].slope = SLOPE_DEFAULT;
+		if (mode == TWO_PT_CALIB) {
+			/*
+			 * slope (m) = adc_code2 - adc_code1 (y2 - y1)/
+			 *	temp_120_degc - temp_30_degc (x2 - x1)
+			 */
+			num = p2[i] - p1[i];
+			num *= SLOPE_FACTOR;
+			den = CAL_DEGC_PT2 - CAL_DEGC_PT1;
+			tmdev->sensor[i].slope = num / den;
+		}
+
+		tmdev->sensor[i].offset = (p1[i] * SLOPE_FACTOR) -
+				(CAL_DEGC_PT1 *
+				tmdev->sensor[i].slope);
+		dev_dbg(tmdev->dev, "offset:%d\n", tmdev->sensor[i].offset);
+	}
+}
+
+static inline int code_to_degc(u32 adc_code, const struct tsens_sensor *s)
+{
+	int degc, num, den;
+
+	num = (adc_code * SLOPE_FACTOR) - s->offset;
+	den = s->slope;
+
+	if (num > 0)
+		degc = num + (den / 2);
+	else if (num < 0)
+		degc = num - (den / 2);
+	else
+		degc = num;
+
+	degc /= den;
+
+	return degc;
+}
+
+int get_temp_common(struct tsens_device *tmdev, int id, int *temp)
+{
+	struct tsens_sensor *s = &tmdev->sensor[id];
+	u32 code;
+	unsigned int sensor_addr;
+	int last_temp = 0, ret;
+
+	sensor_addr = S0_ST_ADDR + s->hw_id * SN_ADDR_OFFSET;
+	ret = regmap_read(tmdev->map, sensor_addr, &code);
+	if (ret)
+		return ret;
+	last_temp = code & SN_ST_TEMP_MASK;
+
+	*temp = code_to_degc(last_temp, s) * 1000;
+
+	return 0;
+}
+
+static const struct regmap_config tsens_config = {
+	.reg_bits	= 32,
+	.val_bits	= 32,
+	.reg_stride	= 4,
+};
+
+int __init init_common(struct tsens_device *tmdev)
+{
+	void __iomem *base;
+
+	base = of_iomap(tmdev->dev->of_node, 0);
+	if (!base)
+		return -EINVAL;
+
+	tmdev->map = devm_regmap_init_mmio(tmdev->dev, base, &tsens_config);
+	if (IS_ERR(tmdev->map)) {
+		iounmap(base);
+		return PTR_ERR(tmdev->map);
+	}
+
+	return 0;
+}
diff --git a/drivers/thermal/qcom/tsens.c b/drivers/thermal/qcom/tsens.c
new file mode 100644
index 000000000000..3f9fe6aa51cc
--- /dev/null
+++ b/drivers/thermal/qcom/tsens.c
@@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 and
+ * only version 2 as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ */
+
+#include <linux/err.h>
+#include <linux/module.h>
+#include <linux/of.h>
+#include <linux/platform_device.h>
+#include <linux/pm.h>
+#include <linux/slab.h>
+#include <linux/thermal.h>
+#include "tsens.h"
+
+static int tsens_get_temp(void *data, int *temp)
+{
+	const struct tsens_sensor *s = data;
+	struct tsens_device *tmdev = s->tmdev;
+
+	return tmdev->ops->get_temp(tmdev, s->id, temp);
+}
+
+static int tsens_get_trend(void *p, int trip, enum thermal_trend *trend)
+{
+	const struct tsens_sensor *s = p;
+	struct tsens_device *tmdev = s->tmdev;
+
+	if (tmdev->ops->get_trend)
+		return  tmdev->ops->get_trend(tmdev, s->id, trend);
+
+	return -ENOTSUPP;
+}
+
+static int  __maybe_unused tsens_suspend(struct device *dev)
+{
+	struct tsens_device *tmdev = dev_get_drvdata(dev);
+
+	if (tmdev->ops && tmdev->ops->suspend)
+		return tmdev->ops->suspend(tmdev);
+
+	return 0;
+}
+
+static int __maybe_unused tsens_resume(struct device *dev)
+{
+	struct tsens_device *tmdev = dev_get_drvdata(dev);
+
+	if (tmdev->ops && tmdev->ops->resume)
+		return tmdev->ops->resume(tmdev);
+
+	return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(tsens_pm_ops, tsens_suspend, tsens_resume);
+
+static const struct of_device_id tsens_table[] = {
+	{
+		.compatible = "qcom,msm8916-tsens",
+		.data = &data_8916,
+	}, {
+		.compatible = "qcom,msm8974-tsens",
+		.data = &data_8974,
+	}, {
+		.compatible = "qcom,msm8996-tsens",
+		.data = &data_8996,
+	},
+	{}
+};
+MODULE_DEVICE_TABLE(of, tsens_table);
+
+static const struct thermal_zone_of_device_ops tsens_of_ops = {
+	.get_temp = tsens_get_temp,
+	.get_trend = tsens_get_trend,
+};
+
+static int tsens_register(struct tsens_device *tmdev)
+{
+	int i;
+	struct thermal_zone_device *tzd;
+	u32 *hw_id, n = tmdev->num_sensors;
+
+	hw_id = devm_kcalloc(tmdev->dev, n, sizeof(u32), GFP_KERNEL);
+	if (!hw_id)
+		return -ENOMEM;
+
+	for (i = 0;  i < tmdev->num_sensors; i++) {
+		tmdev->sensor[i].tmdev = tmdev;
+		tmdev->sensor[i].id = i;
+		tzd = devm_thermal_zone_of_sensor_register(tmdev->dev, i,
+							   &tmdev->sensor[i],
+							   &tsens_of_ops);
+		if (IS_ERR(tzd))
+			continue;
+		tmdev->sensor[i].tzd = tzd;
+		if (tmdev->ops->enable)
+			tmdev->ops->enable(tmdev, i);
+	}
+	return 0;
+}
+
+static int tsens_probe(struct platform_device *pdev)
+{
+	int ret, i;
+	struct device *dev;
+	struct device_node *np;
+	struct tsens_sensor *s;
+	struct tsens_device *tmdev;
+	const struct tsens_data *data;
+	const struct of_device_id *id;
+
+	if (pdev->dev.of_node)
+		dev = &pdev->dev;
+	else
+		dev = pdev->dev.parent;
+
+	np = dev->of_node;
+
+	id = of_match_node(tsens_table, np);
+	if (id)
+		data = id->data;
+	else
+		data = &data_8960;
+
+	if (data->num_sensors <= 0) {
+		dev_err(dev, "invalid number of sensors\n");
+		return -EINVAL;
+	}
+
+	tmdev = devm_kzalloc(dev, sizeof(*tmdev) +
+			     data->num_sensors * sizeof(*s), GFP_KERNEL);
+	if (!tmdev)
+		return -ENOMEM;
+
+	tmdev->dev = dev;
+	tmdev->num_sensors = data->num_sensors;
+	tmdev->ops = data->ops;
+	for (i = 0;  i < tmdev->num_sensors; i++) {
+		if (data->hw_ids)
+			tmdev->sensor[i].hw_id = data->hw_ids[i];
+		else
+			tmdev->sensor[i].hw_id = i;
+	}
+
+	if (!tmdev->ops || !tmdev->ops->init || !tmdev->ops->get_temp)
+		return -EINVAL;
+
+	ret = tmdev->ops->init(tmdev);
+	if (ret < 0) {
+		dev_err(dev, "tsens init failed\n");
+		return ret;
+	}
+
+	if (tmdev->ops->calibrate) {
+		ret = tmdev->ops->calibrate(tmdev);
+		if (ret < 0) {
+			dev_err(dev, "tsens calibration failed\n");
+			return ret;
+		}
+	}
+
+	ret = tsens_register(tmdev);
+
+	platform_set_drvdata(pdev, tmdev);
+
+	return ret;
+}
+
+static int tsens_remove(struct platform_device *pdev)
+{
+	struct tsens_device *tmdev = platform_get_drvdata(pdev);
+
+	if (tmdev->ops->disable)
+		tmdev->ops->disable(tmdev);
+
+	return 0;
+}
+
+static struct platform_driver tsens_driver = {
+	.probe = tsens_probe,
+	.remove = tsens_remove,
+	.driver = {
+		.name = "qcom-tsens",
+		.pm	= &tsens_pm_ops,
+		.of_match_table = tsens_table,
+	},
+};
+module_platform_driver(tsens_driver);
+
+MODULE_LICENSE("GPL v2");
+MODULE_DESCRIPTION("QCOM Temperature Sensor driver");
+MODULE_ALIAS("platform:qcom-tsens");
diff --git a/drivers/thermal/qcom/tsens.h b/drivers/thermal/qcom/tsens.h
new file mode 100644
index 000000000000..911c1978892b
--- /dev/null
+++ b/drivers/thermal/qcom/tsens.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2015, The Linux Foundation. All rights reserved.
+ *
+ * This software is licensed under the terms of the GNU General Public
+ * License version 2, as published by the Free Software Foundation, and
+ * may be copied, distributed, and modified under those terms.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+#ifndef __QCOM_TSENS_H__
+#define __QCOM_TSENS_H__
+
+#define ONE_PT_CALIB		0x1
+#define ONE_PT_CALIB2		0x2
+#define TWO_PT_CALIB		0x3
+
+#include <linux/thermal.h>
+
+struct tsens_device;
+
+struct tsens_sensor {
+	struct tsens_device		*tmdev;
+	struct thermal_zone_device	*tzd;
+	int				offset;
+	int				id;
+	int				hw_id;
+	int				slope;
+	u32				status;
+};
+
+/**
+ * struct tsens_ops - operations as supported by the tsens device
+ * @init: Function to initialize the tsens device
+ * @calibrate: Function to calibrate the tsens device
+ * @get_temp: Function which returns the temp in millidegC
+ * @enable: Function to enable (clocks/power) tsens device
+ * @disable: Function to disable the tsens device
+ * @suspend: Function to suspend the tsens device
+ * @resume: Function to resume the tsens device
+ * @get_trend: Function to get the thermal/temp trend
+ */
+struct tsens_ops {
+	/* mandatory callbacks */
+	int (*init)(struct tsens_device *);
+	int (*calibrate)(struct tsens_device *);
+	int (*get_temp)(struct tsens_device *, int, int *);
+	/* optional callbacks */
+	int (*enable)(struct tsens_device *, int);
+	void (*disable)(struct tsens_device *);
+	int (*suspend)(struct tsens_device *);
+	int (*resume)(struct tsens_device *);
+	int (*get_trend)(struct tsens_device *, int, enum thermal_trend *);
+};
+
+/**
+ * struct tsens_data - tsens instance specific data
+ * @num_sensors: Max number of sensors supported by platform
+ * @ops: operations the tsens instance supports
+ * @hw_ids: Subset of sensors ids supported by platform, if not the first n
+ */
+struct tsens_data {
+	const u32		num_sensors;
+	const struct tsens_ops	*ops;
+	unsigned int		*hw_ids;
+};
+
+/* Registers to be saved/restored across a context loss */
+struct tsens_context {
+	int	threshold;
+	int	control;
+};
+
+struct tsens_device {
+	struct device			*dev;
+	u32				num_sensors;
+	struct regmap			*map;
+	struct regmap_field		*status_field;
+	struct tsens_context		ctx;
+	bool				trdy;
+	const struct tsens_ops		*ops;
+	struct tsens_sensor		sensor[0];
+};
+
+char *qfprom_read(struct device *, const char *);
+void compute_intercept_slope(struct tsens_device *, u32 *, u32 *, u32);
+int init_common(struct tsens_device *);
+int get_temp_common(struct tsens_device *, int, int *);
+
+extern const struct tsens_data data_8916, data_8974, data_8960, data_8996;
+
+#endif /* __QCOM_TSENS_H__ */
diff --git a/drivers/thermal/qoriq_thermal.c b/drivers/thermal/qoriq_thermal.c
new file mode 100644
index 000000000000..644ba526d9ea
--- /dev/null
+++ b/drivers/thermal/qoriq_thermal.c
@@ -0,0 +1,328 @@
+/*
+ * Copyright 2016 Freescale Semiconductor, Inc.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/platform_device.h>
+#include <linux/err.h>
+#include <linux/io.h>
+#include <linux/of.h>
+#include <linux/of_address.h>
+#include <linux/thermal.h>
+
+#include "thermal_core.h"
+
+#define SITES_MAX	16
+
+/*
+ * QorIQ TMU Registers
+ */
+struct qoriq_tmu_site_regs {
+	u32 tritsr;		/* Immediate Temperature Site Register */
+	u32 tratsr;		/* Average Temperature Site Register */
+	u8 res0[0x8];
+};
+
+struct qoriq_tmu_regs {
+	u32 tmr;		/* Mode Register */
+#define TMR_DISABLE	0x0
+#define TMR_ME		0x80000000
+#define TMR_ALPF	0x0c000000
+	u32 tsr;		/* Status Register */
+	u32 tmtmir;		/* Temperature measurement interval Register */
+#define TMTMIR_DEFAULT	0x0000000f
+	u8 res0[0x14];
+	u32 tier;		/* Interrupt Enable Register */
+#define TIER_DISABLE	0x0
+	u32 tidr;		/* Interrupt Detect Register */
+	u32 tiscr;		/* Interrupt Site Capture Register */
+	u32 ticscr;		/* Interrupt Critical Site Capture Register */
+	u8 res1[0x10];
+	u32 tmhtcrh;		/* High Temperature Capture Register */
+	u32 tmhtcrl;		/* Low Temperature Capture Register */
+	u8 res2[0x8];
+	u32 tmhtitr;		/* High Temperature Immediate Threshold */
+	u32 tmhtatr;		/* High Temperature Average Threshold */
+	u32 tmhtactr;	/* High Temperature Average Crit Threshold */
+	u8 res3[0x24];
+	u32 ttcfgr;		/* Temperature Configuration Register */
+	u32 tscfgr;		/* Sensor Configuration Register */
+	u8 res4[0x78];
+	struct qoriq_tmu_site_regs site[SITES_MAX];
+	u8 res5[0x9f8];
+	u32 ipbrr0;		/* IP Block Revision Register 0 */
+	u32 ipbrr1;		/* IP Block Revision Register 1 */
+	u8 res6[0x310];
+	u32 ttr0cr;		/* Temperature Range 0 Control Register */
+	u32 ttr1cr;		/* Temperature Range 1 Control Register */
+	u32 ttr2cr;		/* Temperature Range 2 Control Register */
+	u32 ttr3cr;		/* Temperature Range 3 Control Register */
+};
+
+/*
+ * Thermal zone data
+ */
+struct qoriq_tmu_data {
+	struct thermal_zone_device *tz;
+	struct qoriq_tmu_regs __iomem *regs;
+	int sensor_id;
+	bool little_endian;
+};
+
+static void tmu_write(struct qoriq_tmu_data *p, u32 val, void __iomem *addr)
+{
+	if (p->little_endian)
+		iowrite32(val, addr);
+	else
+		iowrite32be(val, addr);
+}
+
+static u32 tmu_read(struct qoriq_tmu_data *p, void __iomem *addr)
+{
+	if (p->little_endian)
+		return ioread32(addr);
+	else
+		return ioread32be(addr);
+}
+
+static int tmu_get_temp(void *p, int *temp)
+{
+	u32 val;
+	struct qoriq_tmu_data *data = p;
+
+	val = tmu_read(data, &data->regs->site[data->sensor_id].tritsr);
+	*temp = (val & 0xff) * 1000;
+
+	return 0;
+}
+
+static int qoriq_tmu_get_sensor_id(void)
+{
+	int ret, id;
+	struct of_phandle_args sensor_specs;
+	struct device_node *np, *sensor_np;
+
+	np = of_find_node_by_name(NULL, "thermal-zones");
+	if (!np)
+		return -ENODEV;
+
+	sensor_np = of_get_next_child(np, NULL);
+	ret = of_parse_phandle_with_args(sensor_np, "thermal-sensors",
+			"#thermal-sensor-cells",
+			0, &sensor_specs);
+	if (ret) {
+		of_node_put(np);
+		of_node_put(sensor_np);
+		return ret;
+	}
+
+	if (sensor_specs.args_count >= 1) {
+		id = sensor_specs.args[0];
+		WARN(sensor_specs.args_count > 1,
+				"%s: too many cells in sensor specifier %d\n",
+				sensor_specs.np->name, sensor_specs.args_count);
+	} else {
+		id = 0;
+	}
+
+	of_node_put(np);
+	of_node_put(sensor_np);
+
+	return id;
+}
+
+static int qoriq_tmu_calibration(struct platform_device *pdev)
+{
+	int i, val, len;
+	u32 range[4];
+	const u32 *calibration;
+	struct device_node *np = pdev->dev.of_node;
+	struct qoriq_tmu_data *data = platform_get_drvdata(pdev);
+
+	if (of_property_read_u32_array(np, "fsl,tmu-range", range, 4)) {
+		dev_err(&pdev->dev, "missing calibration range.\n");
+		return -ENODEV;
+	}
+
+	/* Init temperature range registers */
+	tmu_write(data, range[0], &data->regs->ttr0cr);
+	tmu_write(data, range[1], &data->regs->ttr1cr);
+	tmu_write(data, range[2], &data->regs->ttr2cr);
+	tmu_write(data, range[3], &data->regs->ttr3cr);
+
+	calibration = of_get_property(np, "fsl,tmu-calibration", &len);
+	if (calibration == NULL || len % 8) {
+		dev_err(&pdev->dev, "invalid calibration data.\n");
+		return -ENODEV;
+	}
+
+	for (i = 0; i < len; i += 8, calibration += 2) {
+		val = of_read_number(calibration, 1);
+		tmu_write(data, val, &data->regs->ttcfgr);
+		val = of_read_number(calibration + 1, 1);
+		tmu_write(data, val, &data->regs->tscfgr);
+	}
+
+	return 0;
+}
+
+static void qoriq_tmu_init_device(struct qoriq_tmu_data *data)
+{
+	/* Disable interrupt, using polling instead */
+	tmu_write(data, TIER_DISABLE, &data->regs->tier);
+
+	/* Set update_interval */
+	tmu_write(data, TMTMIR_DEFAULT, &data->regs->tmtmir);
+
+	/* Disable monitoring */
+	tmu_write(data, TMR_DISABLE, &data->regs->tmr);
+}
+
+static struct thermal_zone_of_device_ops tmu_tz_ops = {
+	.get_temp = tmu_get_temp,
+};
+
+static int qoriq_tmu_probe(struct platform_device *pdev)
+{
+	int ret;
+	const struct thermal_trip *trip;
+	struct qoriq_tmu_data *data;
+	struct device_node *np = pdev->dev.of_node;
+	u32 site = 0;
+
+	if (!np) {
+		dev_err(&pdev->dev, "Device OF-Node is NULL");
+		return -ENODEV;
+	}
+
+	data = devm_kzalloc(&pdev->dev, sizeof(struct qoriq_tmu_data),
+			    GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+
+	platform_set_drvdata(pdev, data);
+
+	data->little_endian = of_property_read_bool(np, "little-endian");
+
+	data->sensor_id = qoriq_tmu_get_sensor_id();
+	if (data->sensor_id < 0) {
+		dev_err(&pdev->dev, "Failed to get sensor id\n");
+		ret = -ENODEV;
+		goto err_iomap;
+	}
+
+	data->regs = of_iomap(np, 0);
+	if (!data->regs) {
+		dev_err(&pdev->dev, "Failed to get memory region\n");
+		ret = -ENODEV;
+		goto err_iomap;
+	}
+
+	qoriq_tmu_init_device(data);	/* TMU initialization */
+
+	ret = qoriq_tmu_calibration(pdev);	/* TMU calibration */
+	if (ret < 0)
+		goto err_tmu;
+
+	data->tz = thermal_zone_of_sensor_register(&pdev->dev, data->sensor_id,
+				data, &tmu_tz_ops);
+	if (IS_ERR(data->tz)) {
+		ret = PTR_ERR(data->tz);
+		dev_err(&pdev->dev,
+			"Failed to register thermal zone device %d\n", ret);
+		goto err_tmu;
+	}
+
+	trip = of_thermal_get_trip_points(data->tz);
+
+	/* Enable monitoring */
+	site |= 0x1 << (15 - data->sensor_id);
+	tmu_write(data, site | TMR_ME | TMR_ALPF, &data->regs->tmr);
+
+	return 0;
+
+err_tmu:
+	iounmap(data->regs);
+
+err_iomap:
+	platform_set_drvdata(pdev, NULL);
+
+	return ret;
+}
+
+static int qoriq_tmu_remove(struct platform_device *pdev)
+{
+	struct qoriq_tmu_data *data = platform_get_drvdata(pdev);
+
+	thermal_zone_of_sensor_unregister(&pdev->dev, data->tz);
+
+	/* Disable monitoring */
+	tmu_write(data, TMR_DISABLE, &data->regs->tmr);
+
+	iounmap(data->regs);
+	platform_set_drvdata(pdev, NULL);
+
+	return 0;
+}
+
+#ifdef CONFIG_PM_SLEEP
+static int qoriq_tmu_suspend(struct device *dev)
+{
+	u32 tmr;
+	struct qoriq_tmu_data *data = dev_get_drvdata(dev);
+
+	/* Disable monitoring */
+	tmr = tmu_read(data, &data->regs->tmr);
+	tmr &= ~TMR_ME;
+	tmu_write(data, tmr, &data->regs->tmr);
+
+	return 0;
+}
+
+static int qoriq_tmu_resume(struct device *dev)
+{
+	u32 tmr;
+	struct qoriq_tmu_data *data = dev_get_drvdata(dev);
+
+	/* Enable monitoring */
+	tmr = tmu_read(data, &data->regs->tmr);
+	tmr |= TMR_ME;
+	tmu_write(data, tmr, &data->regs->tmr);
+
+	return 0;
+}
+#endif
+
+static SIMPLE_DEV_PM_OPS(qoriq_tmu_pm_ops,
+			 qoriq_tmu_suspend, qoriq_tmu_resume);
+
+static const struct of_device_id qoriq_tmu_match[] = {
+	{ .compatible = "fsl,qoriq-tmu", },
+	{},
+};
+MODULE_DEVICE_TABLE(of, qoriq_tmu_match);
+
+static struct platform_driver qoriq_tmu = {
+	.driver	= {
+		.name		= "qoriq_thermal",
+		.pm		= &qoriq_tmu_pm_ops,
+		.of_match_table	= qoriq_tmu_match,
+	},
+	.probe	= qoriq_tmu_probe,
+	.remove	= qoriq_tmu_remove,
+};
+module_platform_driver(qoriq_tmu);
+
+MODULE_AUTHOR("Jia Hongtao <hongtao.jia@nxp.com>");
+MODULE_DESCRIPTION("QorIQ Thermal Monitoring Unit driver");
+MODULE_LICENSE("GPL v2");
diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c
index 5f817923f374..73e5fee6cf1d 100644
--- a/drivers/thermal/rcar_thermal.c
+++ b/drivers/thermal/rcar_thermal.c
@@ -31,6 +31,8 @@
 #include <linux/spinlock.h>
 #include <linux/thermal.h>
 
+#include "thermal_hwmon.h"
+
 #define IDLE_INTERVAL	5000
 
 #define COMMON_STR	0x00
@@ -75,6 +77,8 @@ struct rcar_thermal_priv {
 #define rcar_priv_to_dev(priv)		((priv)->common->dev)
 #define rcar_has_irq_support(priv)	((priv)->common->base)
 #define rcar_id_to_shift(priv)		((priv)->id * 8)
+#define rcar_of_data(dev)		((unsigned long)of_device_get_match_data(dev))
+#define rcar_use_of_thermal(dev)	(rcar_of_data(dev) == USE_OF_THERMAL)
 
 #define USE_OF_THERMAL	1
 static const struct of_device_id rcar_thermal_dt_ids[] = {
@@ -354,7 +358,8 @@ static void rcar_thermal_work(struct work_struct *work)
 		return;
 
 	if (nctemp != cctemp)
-		thermal_zone_device_update(priv->zone);
+		thermal_zone_device_update(priv->zone,
+					   THERMAL_EVENT_UNSPECIFIED);
 }
 
 static u32 rcar_thermal_had_changed(struct rcar_thermal_priv *priv, u32 status)
@@ -415,7 +420,10 @@ static int rcar_thermal_remove(struct platform_device *pdev)
 
 	rcar_thermal_for_each_priv(priv, common) {
 		rcar_thermal_irq_disable(priv);
-		thermal_zone_device_unregister(priv->zone);
+		if (rcar_use_of_thermal(dev))
+			thermal_remove_hwmon_sysfs(priv->zone);
+		else
+			thermal_zone_device_unregister(priv->zone);
 	}
 
 	pm_runtime_put(dev);
@@ -430,7 +438,6 @@ static int rcar_thermal_probe(struct platform_device *pdev)
 	struct rcar_thermal_priv *priv;
 	struct device *dev = &pdev->dev;
 	struct resource *res, *irq;
-	unsigned long of_data = (unsigned long)of_device_get_match_data(dev);
 	int mres = 0;
 	int i;
 	int ret = -ENODEV;
@@ -491,7 +498,7 @@ static int rcar_thermal_probe(struct platform_device *pdev)
 		if (ret < 0)
 			goto error_unregister;
 
-		if (of_data == USE_OF_THERMAL)
+		if (rcar_use_of_thermal(dev))
 			priv->zone = devm_thermal_zone_of_sensor_register(
 						dev, i, priv,
 						&rcar_thermal_zone_of_ops);
@@ -508,6 +515,17 @@ static int rcar_thermal_probe(struct platform_device *pdev)
 			goto error_unregister;
 		}
 
+		if (rcar_use_of_thermal(dev)) {
+			/*
+			 * thermal_zone doesn't enable hwmon as default,
+			 * but, enable it here to keep compatible
+			 */
+			priv->zone->tzp->no_hwmon = false;
+			ret = thermal_add_hwmon_sysfs(priv->zone);
+			if (ret)
+				goto error_unregister;
+		}
+
 		rcar_thermal_irq_enable(priv);
 
 		list_move_tail(&priv->list, &common->head);
diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c
index 5d491f16a866..e227a9f0acf7 100644
--- a/drivers/thermal/rockchip_thermal.c
+++ b/drivers/thermal/rockchip_thermal.c
@@ -96,6 +96,7 @@ struct chip_tsadc_table {
  * @initialize: SoC special initialize tsadc controller method
  * @irq_ack: clear the interrupt
  * @get_temp: get the temperature
+ * @set_alarm_temp: set the high temperature interrupt
  * @set_tshut_temp: set the hardware-controlled shutdown temperature
  * @set_tshut_mode: set the hardware-controlled shutdown mode
  * @table: the chip-specific conversion table
@@ -119,6 +120,8 @@ struct rockchip_tsadc_chip {
 	/* Per-sensor methods */
 	int (*get_temp)(struct chip_tsadc_table table,
 			int chn, void __iomem *reg, int *temp);
+	void (*set_alarm_temp)(struct chip_tsadc_table table,
+			       int chn, void __iomem *reg, int temp);
 	void (*set_tshut_temp)(struct chip_tsadc_table table,
 			       int chn, void __iomem *reg, int temp);
 	void (*set_tshut_mode)(int chn, void __iomem *reg, enum tshut_mode m);
@@ -183,6 +186,7 @@ struct rockchip_thermal_data {
 #define TSADCV2_INT_EN				0x08
 #define TSADCV2_INT_PD				0x0c
 #define TSADCV2_DATA(chn)			(0x20 + (chn) * 0x04)
+#define TSADCV2_COMP_INT(chn)		        (0x30 + (chn) * 0x04)
 #define TSADCV2_COMP_SHUT(chn)		        (0x40 + (chn) * 0x04)
 #define TSADCV2_HIGHT_INT_DEBOUNCE		0x60
 #define TSADCV2_HIGHT_TSHUT_DEBOUNCE		0x64
@@ -207,18 +211,21 @@ struct rockchip_thermal_data {
 
 #define TSADCV2_HIGHT_INT_DEBOUNCE_COUNT	4
 #define TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT	4
-#define TSADCV2_AUTO_PERIOD_TIME		250 /* msec */
-#define TSADCV2_AUTO_PERIOD_HT_TIME		50  /* msec */
+#define TSADCV2_AUTO_PERIOD_TIME		250 /* 250ms */
+#define TSADCV2_AUTO_PERIOD_HT_TIME		50  /* 50ms */
+#define TSADCV3_AUTO_PERIOD_TIME		1875 /* 2.5ms */
+#define TSADCV3_AUTO_PERIOD_HT_TIME		1875 /* 2.5ms */
+
 #define TSADCV2_USER_INTER_PD_SOC		0x340 /* 13 clocks */
 
 #define GRF_SARADC_TESTBIT			0x0e644
 #define GRF_TSADC_TESTBIT_L			0x0e648
 #define GRF_TSADC_TESTBIT_H			0x0e64c
 
-#define GRF_TSADC_TSEN_PD_ON			(0x30003 << 0)
-#define GRF_TSADC_TSEN_PD_OFF			(0x30000 << 0)
 #define GRF_SARADC_TESTBIT_ON			(0x10001 << 2)
 #define GRF_TSADC_TESTBIT_H_ON			(0x10001 << 2)
+#define GRF_TSADC_VCM_EN_L			(0x10001 << 7)
+#define GRF_TSADC_VCM_EN_H			(0x10001 << 7)
 
 /**
  * struct tsadc_table - code to temperature conversion table
@@ -394,13 +401,17 @@ static u32 rk_tsadcv2_temp_to_code(struct chip_tsadc_table table,
 				   int temp)
 {
 	int high, low, mid;
+	u32 error = 0;
 
 	low = 0;
 	high = table.length - 1;
 	mid = (high + low) / 2;
 
-	if (temp < table.id[low].temp || temp > table.id[high].temp)
-		return 0;
+	/* Return mask code data when the temp is over table range */
+	if (temp < table.id[low].temp || temp > table.id[high].temp) {
+		error = table.data_mask;
+		goto exit;
+	}
 
 	while (low <= high) {
 		if (temp == table.id[mid].temp)
@@ -412,7 +423,9 @@ static u32 rk_tsadcv2_temp_to_code(struct chip_tsadc_table table,
 		mid = (low + high) / 2;
 	}
 
-	return 0;
+exit:
+	pr_err("Invalid the conversion, error=%d\n", error);
+	return error;
 }
 
 static int rk_tsadcv2_code_to_temp(struct chip_tsadc_table table, u32 code,
@@ -543,14 +556,34 @@ static void rk_tsadcv3_initialize(struct regmap *grf, void __iomem *regs,
 		/* Set interleave value to workround ic time sync issue */
 		writel_relaxed(TSADCV2_USER_INTER_PD_SOC, regs +
 			       TSADCV2_USER_CON);
+
+		writel_relaxed(TSADCV2_AUTO_PERIOD_TIME,
+			       regs + TSADCV2_AUTO_PERIOD);
+		writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_INT_DEBOUNCE);
+		writel_relaxed(TSADCV2_AUTO_PERIOD_HT_TIME,
+			       regs + TSADCV2_AUTO_PERIOD_HT);
+		writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
+
 	} else {
-		regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_ON);
-		mdelay(10);
-		regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_TSEN_PD_OFF);
+		/* Enable the voltage common mode feature */
+		regmap_write(grf, GRF_TSADC_TESTBIT_L, GRF_TSADC_VCM_EN_L);
+		regmap_write(grf, GRF_TSADC_TESTBIT_H, GRF_TSADC_VCM_EN_H);
+
 		usleep_range(15, 100); /* The spec note says at least 15 us */
 		regmap_write(grf, GRF_SARADC_TESTBIT, GRF_SARADC_TESTBIT_ON);
 		regmap_write(grf, GRF_TSADC_TESTBIT_H, GRF_TSADC_TESTBIT_H_ON);
 		usleep_range(90, 200); /* The spec note says at least 90 us */
+
+		writel_relaxed(TSADCV3_AUTO_PERIOD_TIME,
+			       regs + TSADCV2_AUTO_PERIOD);
+		writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_INT_DEBOUNCE);
+		writel_relaxed(TSADCV3_AUTO_PERIOD_HT_TIME,
+			       regs + TSADCV2_AUTO_PERIOD_HT);
+		writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
+			       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
 	}
 
 	if (tshut_polarity == TSHUT_HIGH_ACTIVE)
@@ -559,14 +592,6 @@ static void rk_tsadcv3_initialize(struct regmap *grf, void __iomem *regs,
 	else
 		writel_relaxed(0U & ~TSADCV2_AUTO_TSHUT_POLARITY_HIGH,
 			       regs + TSADCV2_AUTO_CON);
-
-	writel_relaxed(TSADCV2_AUTO_PERIOD_TIME, regs + TSADCV2_AUTO_PERIOD);
-	writel_relaxed(TSADCV2_HIGHT_INT_DEBOUNCE_COUNT,
-		       regs + TSADCV2_HIGHT_INT_DEBOUNCE);
-	writel_relaxed(TSADCV2_AUTO_PERIOD_HT_TIME,
-		       regs + TSADCV2_AUTO_PERIOD_HT);
-	writel_relaxed(TSADCV2_HIGHT_TSHUT_DEBOUNCE_COUNT,
-		       regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE);
 }
 
 static void rk_tsadcv2_irq_ack(void __iomem *regs)
@@ -628,12 +653,34 @@ static int rk_tsadcv2_get_temp(struct chip_tsadc_table table,
 	return rk_tsadcv2_code_to_temp(table, val, temp);
 }
 
+static void rk_tsadcv2_alarm_temp(struct chip_tsadc_table table,
+				  int chn, void __iomem *regs, int temp)
+{
+	u32 alarm_value, int_en;
+
+	/* Make sure the value is valid */
+	alarm_value = rk_tsadcv2_temp_to_code(table, temp);
+	if (alarm_value == table.data_mask)
+		return;
+
+	writel_relaxed(alarm_value & table.data_mask,
+		       regs + TSADCV2_COMP_INT(chn));
+
+	int_en = readl_relaxed(regs + TSADCV2_INT_EN);
+	int_en |= TSADCV2_INT_SRC_EN(chn);
+	writel_relaxed(int_en, regs + TSADCV2_INT_EN);
+}
+
 static void rk_tsadcv2_tshut_temp(struct chip_tsadc_table table,
 				  int chn, void __iomem *regs, int temp)
 {
 	u32 tshut_value, val;
 
+	/* Make sure the value is valid */
 	tshut_value = rk_tsadcv2_temp_to_code(table, temp);
+	if (tshut_value == table.data_mask)
+		return;
+
 	writel_relaxed(tshut_value, regs + TSADCV2_COMP_SHUT(chn));
 
 	/* TSHUT will be valid */
@@ -670,6 +717,7 @@ static const struct rockchip_tsadc_chip rk3228_tsadc_data = {
 	.irq_ack = rk_tsadcv3_irq_ack,
 	.control = rk_tsadcv3_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -694,6 +742,7 @@ static const struct rockchip_tsadc_chip rk3288_tsadc_data = {
 	.irq_ack = rk_tsadcv2_irq_ack,
 	.control = rk_tsadcv2_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -718,6 +767,7 @@ static const struct rockchip_tsadc_chip rk3366_tsadc_data = {
 	.irq_ack = rk_tsadcv3_irq_ack,
 	.control = rk_tsadcv3_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -742,6 +792,7 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = {
 	.irq_ack = rk_tsadcv2_irq_ack,
 	.control = rk_tsadcv2_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -766,6 +817,7 @@ static const struct rockchip_tsadc_chip rk3399_tsadc_data = {
 	.irq_ack = rk_tsadcv3_irq_ack,
 	.control = rk_tsadcv3_control,
 	.get_temp = rk_tsadcv2_get_temp,
+	.set_alarm_temp = rk_tsadcv2_alarm_temp,
 	.set_tshut_temp = rk_tsadcv2_tshut_temp,
 	.set_tshut_mode = rk_tsadcv2_tshut_mode,
 
@@ -821,11 +873,27 @@ static irqreturn_t rockchip_thermal_alarm_irq_thread(int irq, void *dev)
 	thermal->chip->irq_ack(thermal->regs);
 
 	for (i = 0; i < thermal->chip->chn_num; i++)
-		thermal_zone_device_update(thermal->sensors[i].tzd);
+		thermal_zone_device_update(thermal->sensors[i].tzd,
+					   THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
 
+static int rockchip_thermal_set_trips(void *_sensor, int low, int high)
+{
+	struct rockchip_thermal_sensor *sensor = _sensor;
+	struct rockchip_thermal_data *thermal = sensor->thermal;
+	const struct rockchip_tsadc_chip *tsadc = thermal->chip;
+
+	dev_dbg(&thermal->pdev->dev, "%s: sensor %d: low: %d, high %d\n",
+		__func__, sensor->id, low, high);
+
+	tsadc->set_alarm_temp(tsadc->table,
+			      sensor->id, thermal->regs, high);
+
+	return 0;
+}
+
 static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
 {
 	struct rockchip_thermal_sensor *sensor = _sensor;
@@ -843,6 +911,7 @@ static int rockchip_thermal_get_temp(void *_sensor, int *out_temp)
 
 static const struct thermal_zone_of_device_ops rockchip_of_thermal_ops = {
 	.get_temp = rockchip_thermal_get_temp,
+	.set_trips = rockchip_thermal_set_trips,
 };
 
 static int rockchip_configure_from_dt(struct device *dev,
diff --git a/drivers/thermal/samsung/exynos_tmu.c b/drivers/thermal/samsung/exynos_tmu.c
index f3ce94ec73b5..ad1186dd6132 100644
--- a/drivers/thermal/samsung/exynos_tmu.c
+++ b/drivers/thermal/samsung/exynos_tmu.c
@@ -225,7 +225,7 @@ static void exynos_report_trigger(struct exynos_tmu_data *p)
 		return;
 	}
 
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	mutex_lock(&tz->lock);
 	/* Find the level for which trip happened */
diff --git a/drivers/thermal/st/st_thermal_memmap.c b/drivers/thermal/st/st_thermal_memmap.c
index fc0c9e198710..91d42319de27 100644
--- a/drivers/thermal/st/st_thermal_memmap.c
+++ b/drivers/thermal/st/st_thermal_memmap.c
@@ -42,7 +42,8 @@ static irqreturn_t st_mmap_thermal_trip_handler(int irq, void *sdata)
 {
 	struct st_thermal_sensor *sensor = sdata;
 
-	thermal_zone_device_update(sensor->thermal_dev);
+	thermal_zone_device_update(sensor->thermal_dev,
+				   THERMAL_EVENT_UNSPECIFIED);
 
 	return IRQ_HANDLED;
 }
diff --git a/drivers/thermal/tango_thermal.c b/drivers/thermal/tango_thermal.c
index 70e0d9f406e9..201304aeafeb 100644
--- a/drivers/thermal/tango_thermal.c
+++ b/drivers/thermal/tango_thermal.c
@@ -64,6 +64,12 @@ static const struct thermal_zone_of_device_ops ops = {
 	.get_temp	= tango_get_temp,
 };
 
+static void tango_thermal_init(struct tango_thermal_priv *priv)
+{
+	writel(0, priv->base + TEMPSI_CFG);
+	writel(CMD_ON, priv->base + TEMPSI_CMD);
+}
+
 static int tango_thermal_probe(struct platform_device *pdev)
 {
 	struct resource *res;
@@ -79,14 +85,22 @@ static int tango_thermal_probe(struct platform_device *pdev)
 	if (IS_ERR(priv->base))
 		return PTR_ERR(priv->base);
 
+	platform_set_drvdata(pdev, priv);
 	priv->thresh_idx = IDX_MIN;
-	writel(0, priv->base + TEMPSI_CFG);
-	writel(CMD_ON, priv->base + TEMPSI_CMD);
+	tango_thermal_init(priv);
 
 	tzdev = devm_thermal_zone_of_sensor_register(&pdev->dev, 0, priv, &ops);
 	return PTR_ERR_OR_ZERO(tzdev);
 }
 
+static int __maybe_unused tango_thermal_resume(struct device *dev)
+{
+	tango_thermal_init(dev_get_drvdata(dev));
+	return 0;
+}
+
+static SIMPLE_DEV_PM_OPS(tango_thermal_pm, NULL, tango_thermal_resume);
+
 static const struct of_device_id tango_sensor_ids[] = {
 	{
 		.compatible = "sigma,smp8758-thermal",
@@ -99,6 +113,7 @@ static struct platform_driver tango_thermal_driver = {
 	.driver	= {
 		.name		= "tango-thermal",
 		.of_match_table	= tango_sensor_ids,
+		.pm		= &tango_thermal_pm,
 	},
 };
 
diff --git a/drivers/thermal/tegra/soctherm.c b/drivers/thermal/tegra/soctherm.c
index b8651726201e..7d2db23d71a3 100644
--- a/drivers/thermal/tegra/soctherm.c
+++ b/drivers/thermal/tegra/soctherm.c
@@ -30,6 +30,7 @@
 
 #include <dt-bindings/thermal/tegra124-soctherm.h>
 
+#include "../thermal_core.h"
 #include "soctherm.h"
 
 #define SENSOR_CONFIG0				0
@@ -67,35 +68,228 @@
 #define READBACK_ADD_HALF			BIT(7)
 #define READBACK_NEGATE				BIT(0)
 
+/*
+ * THERMCTL_LEVEL0_GROUP_CPU is defined in soctherm.h
+ * because it will be used by tegraxxx_soctherm.c
+ */
+#define THERMCTL_LVL0_CPU0_EN_MASK		BIT(8)
+#define THERMCTL_LVL0_CPU0_CPU_THROT_MASK	(0x3 << 5)
+#define THERMCTL_LVL0_CPU0_CPU_THROT_LIGHT	0x1
+#define THERMCTL_LVL0_CPU0_CPU_THROT_HEAVY	0x2
+#define THERMCTL_LVL0_CPU0_GPU_THROT_MASK	(0x3 << 3)
+#define THERMCTL_LVL0_CPU0_GPU_THROT_LIGHT	0x1
+#define THERMCTL_LVL0_CPU0_GPU_THROT_HEAVY	0x2
+#define THERMCTL_LVL0_CPU0_MEM_THROT_MASK	BIT(2)
+#define THERMCTL_LVL0_CPU0_STATUS_MASK		0x3
+
+#define THERMCTL_LVL0_UP_STATS			0x10
+#define THERMCTL_LVL0_DN_STATS			0x14
+
+#define THERMCTL_STATS_CTL			0x94
+#define STATS_CTL_CLR_DN			0x8
+#define STATS_CTL_EN_DN				0x4
+#define STATS_CTL_CLR_UP			0x2
+#define STATS_CTL_EN_UP				0x1
+
+#define THROT_GLOBAL_CFG			0x400
+#define THROT_GLOBAL_ENB_MASK			BIT(0)
+
+#define CPU_PSKIP_STATUS			0x418
+#define XPU_PSKIP_STATUS_M_MASK			(0xff << 12)
+#define XPU_PSKIP_STATUS_N_MASK			(0xff << 4)
+#define XPU_PSKIP_STATUS_SW_OVERRIDE_MASK	BIT(1)
+#define XPU_PSKIP_STATUS_ENABLED_MASK		BIT(0)
+
+#define THROT_PRIORITY_LOCK			0x424
+#define THROT_PRIORITY_LOCK_PRIORITY_MASK	0xff
+
+#define THROT_STATUS				0x428
+#define THROT_STATUS_BREACH_MASK		BIT(12)
+#define THROT_STATUS_STATE_MASK			(0xff << 4)
+#define THROT_STATUS_ENABLED_MASK		BIT(0)
+
+#define THROT_PSKIP_CTRL_LITE_CPU		0x430
+#define THROT_PSKIP_CTRL_ENABLE_MASK            BIT(31)
+#define THROT_PSKIP_CTRL_DIVIDEND_MASK          (0xff << 8)
+#define THROT_PSKIP_CTRL_DIVISOR_MASK           0xff
+#define THROT_PSKIP_CTRL_VECT_GPU_MASK          (0x7 << 16)
+#define THROT_PSKIP_CTRL_VECT_CPU_MASK          (0x7 << 8)
+#define THROT_PSKIP_CTRL_VECT2_CPU_MASK         0x7
+
+#define THROT_VECT_NONE				0x0 /* 3'b000 */
+#define THROT_VECT_LOW				0x1 /* 3'b001 */
+#define THROT_VECT_MED				0x3 /* 3'b011 */
+#define THROT_VECT_HIGH				0x7 /* 3'b111 */
+
+#define THROT_PSKIP_RAMP_LITE_CPU		0x434
+#define THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK	BIT(31)
+#define THROT_PSKIP_RAMP_DURATION_MASK		(0xffff << 8)
+#define THROT_PSKIP_RAMP_STEP_MASK		0xff
+
+#define THROT_PRIORITY_LITE			0x444
+#define THROT_PRIORITY_LITE_PRIO_MASK		0xff
+
+#define THROT_DELAY_LITE			0x448
+#define THROT_DELAY_LITE_DELAY_MASK		0xff
+
+/* car register offsets needed for enabling HW throttling */
+#define CAR_SUPER_CCLKG_DIVIDER			0x36c
+#define CDIVG_USE_THERM_CONTROLS_MASK		BIT(30)
+
+/* ccroc register offsets needed for enabling HW throttling for Tegra132 */
+#define CCROC_SUPER_CCLKG_DIVIDER		0x024
+
+#define CCROC_GLOBAL_CFG			0x148
+
+#define CCROC_THROT_PSKIP_RAMP_CPU		0x150
+#define CCROC_THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK	BIT(31)
+#define CCROC_THROT_PSKIP_RAMP_DURATION_MASK	(0xffff << 8)
+#define CCROC_THROT_PSKIP_RAMP_STEP_MASK	0xff
+
+#define CCROC_THROT_PSKIP_CTRL_CPU		0x154
+#define CCROC_THROT_PSKIP_CTRL_ENB_MASK		BIT(31)
+#define CCROC_THROT_PSKIP_CTRL_DIVIDEND_MASK	(0xff << 8)
+#define CCROC_THROT_PSKIP_CTRL_DIVISOR_MASK	0xff
+
 /* get val from register(r) mask bits(m) */
 #define REG_GET_MASK(r, m)	(((r) & (m)) >> (ffs(m) - 1))
 /* set val(v) to mask bits(m) of register(r) */
 #define REG_SET_MASK(r, m, v)	(((r) & ~(m)) | \
 				 (((v) & (m >> (ffs(m) - 1))) << (ffs(m) - 1)))
 
+/* get dividend from the depth */
+#define THROT_DEPTH_DIVIDEND(depth)	((256 * (100 - (depth)) / 100) - 1)
+
+/* get THROT_PSKIP_xxx offset per LIGHT/HEAVY throt and CPU/GPU dev */
+#define THROT_OFFSET			0x30
+#define THROT_PSKIP_CTRL(throt, dev)	(THROT_PSKIP_CTRL_LITE_CPU + \
+					(THROT_OFFSET * throt) + (8 * dev))
+#define THROT_PSKIP_RAMP(throt, dev)	(THROT_PSKIP_RAMP_LITE_CPU + \
+					(THROT_OFFSET * throt) + (8 * dev))
+
+/* get THROT_xxx_CTRL offset per LIGHT/HEAVY throt */
+#define THROT_PRIORITY_CTRL(throt)	(THROT_PRIORITY_LITE + \
+					(THROT_OFFSET * throt))
+#define THROT_DELAY_CTRL(throt)		(THROT_DELAY_LITE + \
+					(THROT_OFFSET * throt))
+
+/* get CCROC_THROT_PSKIP_xxx offset per HIGH/MED/LOW vect*/
+#define CCROC_THROT_OFFSET			0x0c
+#define CCROC_THROT_PSKIP_CTRL_CPU_REG(vect)    (CCROC_THROT_PSKIP_CTRL_CPU + \
+						(CCROC_THROT_OFFSET * vect))
+#define CCROC_THROT_PSKIP_RAMP_CPU_REG(vect)    (CCROC_THROT_PSKIP_RAMP_CPU + \
+						(CCROC_THROT_OFFSET * vect))
+
+/* get THERMCTL_LEVELx offset per CPU/GPU/MEM/TSENSE rg and LEVEL0~3 lv */
+#define THERMCTL_LVL_REGS_SIZE		0x20
+#define THERMCTL_LVL_REG(rg, lv)	((rg) + ((lv) * THERMCTL_LVL_REGS_SIZE))
+
 static const int min_low_temp = -127000;
 static const int max_high_temp = 127000;
 
+enum soctherm_throttle_id {
+	THROTTLE_LIGHT = 0,
+	THROTTLE_HEAVY,
+	THROTTLE_SIZE,
+};
+
+enum soctherm_throttle_dev_id {
+	THROTTLE_DEV_CPU = 0,
+	THROTTLE_DEV_GPU,
+	THROTTLE_DEV_SIZE,
+};
+
+static const char *const throt_names[] = {
+	[THROTTLE_LIGHT] = "light",
+	[THROTTLE_HEAVY] = "heavy",
+};
+
+struct tegra_soctherm;
 struct tegra_thermctl_zone {
 	void __iomem *reg;
 	struct device *dev;
+	struct tegra_soctherm *ts;
 	struct thermal_zone_device *tz;
 	const struct tegra_tsensor_group *sg;
 };
 
+struct soctherm_throt_cfg {
+	const char *name;
+	unsigned int id;
+	u8 priority;
+	u8 cpu_throt_level;
+	u32 cpu_throt_depth;
+	struct thermal_cooling_device *cdev;
+	bool init;
+};
+
 struct tegra_soctherm {
 	struct reset_control *reset;
 	struct clk *clock_tsensor;
 	struct clk *clock_soctherm;
 	void __iomem *regs;
-	struct thermal_zone_device **thermctl_tzs;
+	void __iomem *clk_regs;
+	void __iomem *ccroc_regs;
 
 	u32 *calib;
+	struct thermal_zone_device **thermctl_tzs;
 	struct tegra_soctherm_soc *soc;
 
+	struct soctherm_throt_cfg throt_cfgs[THROTTLE_SIZE];
+
 	struct dentry *debugfs_dir;
 };
 
+/**
+ * clk_writel() - writes a value to a CAR register
+ * @ts: pointer to a struct tegra_soctherm
+ * @v: the value to write
+ * @reg: the register offset
+ *
+ * Writes @v to @reg.  No return value.
+ */
+static inline void clk_writel(struct tegra_soctherm *ts, u32 value, u32 reg)
+{
+	writel(value, (ts->clk_regs + reg));
+}
+
+/**
+ * clk_readl() - reads specified register from CAR IP block
+ * @ts: pointer to a struct tegra_soctherm
+ * @reg: register address to be read
+ *
+ * Return: the value of the register
+ */
+static inline u32 clk_readl(struct tegra_soctherm *ts, u32 reg)
+{
+	return readl(ts->clk_regs + reg);
+}
+
+/**
+ * ccroc_writel() - writes a value to a CCROC register
+ * @ts: pointer to a struct tegra_soctherm
+ * @v: the value to write
+ * @reg: the register offset
+ *
+ * Writes @v to @reg.  No return value.
+ */
+static inline void ccroc_writel(struct tegra_soctherm *ts, u32 value, u32 reg)
+{
+	writel(value, (ts->ccroc_regs + reg));
+}
+
+/**
+ * ccroc_readl() - reads specified register from CCROC IP block
+ * @ts: pointer to a struct tegra_soctherm
+ * @reg: register address to be read
+ *
+ * Return: the value of the register
+ */
+static inline u32 ccroc_readl(struct tegra_soctherm *ts, u32 reg)
+{
+	return readl(ts->ccroc_regs + reg);
+}
+
 static void enable_tsensor(struct tegra_soctherm *tegra, unsigned int i)
 {
 	const struct tegra_tsensor *sensor = &tegra->soc->tsensors[i];
@@ -150,11 +344,17 @@ static int tegra_thermctl_get_temp(void *data, int *out_temp)
 static int
 thermtrip_program(struct device *dev, const struct tegra_tsensor_group *sg,
 		  int trip_temp);
+static int
+throttrip_program(struct device *dev, const struct tegra_tsensor_group *sg,
+		  struct soctherm_throt_cfg *stc, int trip_temp);
+static struct soctherm_throt_cfg *
+find_throttle_cfg_by_name(struct tegra_soctherm *ts, const char *name);
 
 static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
 {
 	struct tegra_thermctl_zone *zone = data;
 	struct thermal_zone_device *tz = zone->tz;
+	struct tegra_soctherm *ts = zone->ts;
 	const struct tegra_tsensor_group *sg = zone->sg;
 	struct device *dev = zone->dev;
 	enum thermal_trip_type type;
@@ -167,10 +367,29 @@ static int tegra_thermctl_set_trip_temp(void *data, int trip, int temp)
 	if (ret)
 		return ret;
 
-	if (type != THERMAL_TRIP_CRITICAL)
-		return 0;
+	if (type == THERMAL_TRIP_CRITICAL) {
+		return thermtrip_program(dev, sg, temp);
+	} else if (type == THERMAL_TRIP_HOT) {
+		int i;
+
+		for (i = 0; i < THROTTLE_SIZE; i++) {
+			struct thermal_cooling_device *cdev;
+			struct soctherm_throt_cfg *stc;
+
+			if (!ts->throt_cfgs[i].init)
+				continue;
+
+			cdev = ts->throt_cfgs[i].cdev;
+			if (get_thermal_instance(tz, cdev, trip))
+				stc = find_throttle_cfg_by_name(ts, cdev->type);
+			else
+				continue;
+
+			return throttrip_program(dev, sg, stc, temp);
+		}
+	}
 
-	return thermtrip_program(dev, sg, temp);
+	return 0;
 }
 
 static const struct thermal_zone_of_device_ops tegra_of_thermal_ops = {
@@ -238,14 +457,110 @@ static int thermtrip_program(struct device *dev,
 }
 
 /**
+ * throttrip_program() - Configures the hardware to throttle the
+ * pulse if a given sensor group reaches a given temperature
+ * @dev: ptr to the struct device for the SOC_THERM IP block
+ * @sg: pointer to the sensor group to set the thermtrip temperature for
+ * @stc: pointer to the throttle need to be triggered
+ * @trip_temp: the temperature in millicelsius to trigger the thermal trip at
+ *
+ * Sets the thermal trip threshold and throttle event of the given sensor
+ * group. If this threshold is crossed, the hardware will trigger the
+ * throttle.
+ *
+ * Note that, although @trip_temp is specified in millicelsius, the
+ * hardware is programmed in degrees Celsius.
+ *
+ * Return: 0 upon success, or %-EINVAL upon failure.
+ */
+static int throttrip_program(struct device *dev,
+			     const struct tegra_tsensor_group *sg,
+			     struct soctherm_throt_cfg *stc,
+			     int trip_temp)
+{
+	struct tegra_soctherm *ts = dev_get_drvdata(dev);
+	int temp, cpu_throt, gpu_throt;
+	unsigned int throt;
+	u32 r, reg_off;
+
+	if (!dev || !sg || !stc || !stc->init)
+		return -EINVAL;
+
+	temp = enforce_temp_range(dev, trip_temp) / ts->soc->thresh_grain;
+
+	/* Hardcode LIGHT on LEVEL1 and HEAVY on LEVEL2 */
+	throt = stc->id;
+	reg_off = THERMCTL_LVL_REG(sg->thermctl_lvl0_offset, throt + 1);
+
+	if (throt == THROTTLE_LIGHT) {
+		cpu_throt = THERMCTL_LVL0_CPU0_CPU_THROT_LIGHT;
+		gpu_throt = THERMCTL_LVL0_CPU0_GPU_THROT_LIGHT;
+	} else {
+		cpu_throt = THERMCTL_LVL0_CPU0_CPU_THROT_HEAVY;
+		gpu_throt = THERMCTL_LVL0_CPU0_GPU_THROT_HEAVY;
+		if (throt != THROTTLE_HEAVY)
+			dev_warn(dev,
+				 "invalid throt id %d - assuming HEAVY",
+				 throt);
+	}
+
+	r = readl(ts->regs + reg_off);
+	r = REG_SET_MASK(r, sg->thermctl_lvl0_up_thresh_mask, temp);
+	r = REG_SET_MASK(r, sg->thermctl_lvl0_dn_thresh_mask, temp);
+	r = REG_SET_MASK(r, THERMCTL_LVL0_CPU0_CPU_THROT_MASK, cpu_throt);
+	r = REG_SET_MASK(r, THERMCTL_LVL0_CPU0_GPU_THROT_MASK, gpu_throt);
+	r = REG_SET_MASK(r, THERMCTL_LVL0_CPU0_EN_MASK, 1);
+	writel(r, ts->regs + reg_off);
+
+	return 0;
+}
+
+static struct soctherm_throt_cfg *
+find_throttle_cfg_by_name(struct tegra_soctherm *ts, const char *name)
+{
+	unsigned int i;
+
+	for (i = 0; ts->throt_cfgs[i].name; i++)
+		if (!strcmp(ts->throt_cfgs[i].name, name))
+			return &ts->throt_cfgs[i];
+
+	return NULL;
+}
+
+static int get_hot_temp(struct thermal_zone_device *tz, int *trip, int *temp)
+{
+	int ntrips, i, ret;
+	enum thermal_trip_type type;
+
+	ntrips = of_thermal_get_ntrips(tz);
+	if (ntrips <= 0)
+		return -EINVAL;
+
+	for (i = 0; i < ntrips; i++) {
+		ret = tz->ops->get_trip_type(tz, i, &type);
+		if (ret)
+			return -EINVAL;
+		if (type == THERMAL_TRIP_HOT) {
+			ret = tz->ops->get_trip_temp(tz, i, temp);
+			if (!ret)
+				*trip = i;
+
+			return ret;
+		}
+	}
+
+	return -EINVAL;
+}
+
+/**
  * tegra_soctherm_set_hwtrips() - set HW trip point from DT data
  * @dev: struct device * of the SOC_THERM instance
  *
  * Configure the SOC_THERM HW trip points, setting "THERMTRIP"
- * trip points , using "critical" type trip_temp from thermal
- * zone.
- * After they have been configured, THERMTRIP will take action
- * when the configured SoC thermal sensor group reaches a
+ * "THROTTLE" trip points , using "critical" or "hot" type trip_temp
+ * from thermal zone.
+ * After they have been configured, THERMTRIP or THROTTLE will take
+ * action when the configured SoC thermal sensor group reaches a
  * certain temperature.
  *
  * Return: 0 upon success, or a negative error code on failure.
@@ -254,19 +569,24 @@ static int thermtrip_program(struct device *dev,
  * THERMTRIP has been enabled successfully when a message similar to
  * this one appears on the serial console:
  * "thermtrip: will shut down when sensor group XXX reaches YYYYYY mC"
+ * THROTTLE has been enabled successfully when a message similar to
+ * this one appears on the serial console:
+ * ""throttrip: will throttle when sensor group XXX reaches YYYYYY mC"
  */
 static int tegra_soctherm_set_hwtrips(struct device *dev,
 				      const struct tegra_tsensor_group *sg,
 				      struct thermal_zone_device *tz)
 {
-	int temperature;
+	struct tegra_soctherm *ts = dev_get_drvdata(dev);
+	struct soctherm_throt_cfg *stc;
+	int i, trip, temperature;
 	int ret;
 
 	ret = tz->ops->get_crit_temp(tz, &temperature);
 	if (ret) {
 		dev_warn(dev, "thermtrip: %s: missing critical temperature\n",
 			 sg->name);
-		return ret;
+		goto set_throttle;
 	}
 
 	ret = thermtrip_program(dev, sg, temperature);
@@ -280,6 +600,43 @@ static int tegra_soctherm_set_hwtrips(struct device *dev,
 		 "thermtrip: will shut down when %s reaches %d mC\n",
 		 sg->name, temperature);
 
+set_throttle:
+	ret = get_hot_temp(tz, &trip, &temperature);
+	if (ret) {
+		dev_warn(dev, "throttrip: %s: missing hot temperature\n",
+			 sg->name);
+		return 0;
+	}
+
+	for (i = 0; i < THROTTLE_SIZE; i++) {
+		struct thermal_cooling_device *cdev;
+
+		if (!ts->throt_cfgs[i].init)
+			continue;
+
+		cdev = ts->throt_cfgs[i].cdev;
+		if (get_thermal_instance(tz, cdev, trip))
+			stc = find_throttle_cfg_by_name(ts, cdev->type);
+		else
+			continue;
+
+		ret = throttrip_program(dev, sg, stc, temperature);
+		if (ret) {
+			dev_err(dev, "throttrip: %s: error during enable\n",
+				sg->name);
+			return ret;
+		}
+
+		dev_info(dev,
+			 "throttrip: will throttle when %s reaches %d mC\n",
+			 sg->name, temperature);
+		break;
+	}
+
+	if (i == THROTTLE_SIZE)
+		dev_warn(dev, "throttrip: %s: missing throttle cdev\n",
+			 sg->name);
+
 	return 0;
 }
 
@@ -291,7 +648,7 @@ static int regs_show(struct seq_file *s, void *data)
 	const struct tegra_tsensor *tsensors = ts->soc->tsensors;
 	const struct tegra_tsensor_group **ttgs = ts->soc->ttgs;
 	u32 r, state;
-	int i;
+	int i, level;
 
 	seq_puts(s, "-----TSENSE (convert HW)-----\n");
 
@@ -365,6 +722,81 @@ static int regs_show(struct seq_file *s, void *data)
 	state = REG_GET_MASK(r, SENSOR_TEMP2_MEM_TEMP_MASK);
 	seq_printf(s, " MEM(%d)\n", translate_temp(state));
 
+	for (i = 0; i < ts->soc->num_ttgs; i++) {
+		seq_printf(s, "%s:\n", ttgs[i]->name);
+		for (level = 0; level < 4; level++) {
+			s32 v;
+			u32 mask;
+			u16 off = ttgs[i]->thermctl_lvl0_offset;
+
+			r = readl(ts->regs + THERMCTL_LVL_REG(off, level));
+
+			mask = ttgs[i]->thermctl_lvl0_up_thresh_mask;
+			state = REG_GET_MASK(r, mask);
+			v = sign_extend32(state, ts->soc->bptt - 1);
+			v *= ts->soc->thresh_grain;
+			seq_printf(s, "   %d: Up/Dn(%d /", level, v);
+
+			mask = ttgs[i]->thermctl_lvl0_dn_thresh_mask;
+			state = REG_GET_MASK(r, mask);
+			v = sign_extend32(state, ts->soc->bptt - 1);
+			v *= ts->soc->thresh_grain;
+			seq_printf(s, "%d ) ", v);
+
+			mask = THERMCTL_LVL0_CPU0_EN_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_printf(s, "En(%d) ", state);
+
+			mask = THERMCTL_LVL0_CPU0_CPU_THROT_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_puts(s, "CPU Throt");
+			if (!state)
+				seq_printf(s, "(%s) ", "none");
+			else if (state == THERMCTL_LVL0_CPU0_CPU_THROT_LIGHT)
+				seq_printf(s, "(%s) ", "L");
+			else if (state == THERMCTL_LVL0_CPU0_CPU_THROT_HEAVY)
+				seq_printf(s, "(%s) ", "H");
+			else
+				seq_printf(s, "(%s) ", "H+L");
+
+			mask = THERMCTL_LVL0_CPU0_GPU_THROT_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_puts(s, "GPU Throt");
+			if (!state)
+				seq_printf(s, "(%s) ", "none");
+			else if (state == THERMCTL_LVL0_CPU0_GPU_THROT_LIGHT)
+				seq_printf(s, "(%s) ", "L");
+			else if (state == THERMCTL_LVL0_CPU0_GPU_THROT_HEAVY)
+				seq_printf(s, "(%s) ", "H");
+			else
+				seq_printf(s, "(%s) ", "H+L");
+
+			mask = THERMCTL_LVL0_CPU0_STATUS_MASK;
+			state = REG_GET_MASK(r, mask);
+			seq_printf(s, "Status(%s)\n",
+				   state == 0 ? "LO" :
+				   state == 1 ? "In" :
+				   state == 2 ? "Res" : "HI");
+		}
+	}
+
+	r = readl(ts->regs + THERMCTL_STATS_CTL);
+	seq_printf(s, "STATS: Up(%s) Dn(%s)\n",
+		   r & STATS_CTL_EN_UP ? "En" : "--",
+		   r & STATS_CTL_EN_DN ? "En" : "--");
+
+	for (level = 0; level < 4; level++) {
+		u16 off;
+
+		off = THERMCTL_LVL0_UP_STATS;
+		r = readl(ts->regs + THERMCTL_LVL_REG(off, level));
+		seq_printf(s, "  Level_%d Up(%d) ", level, r);
+
+		off = THERMCTL_LVL0_DN_STATS;
+		r = readl(ts->regs + THERMCTL_LVL_REG(off, level));
+		seq_printf(s, "Dn(%d)\n", r);
+	}
+
 	r = readl(ts->regs + THERMCTL_THERMTRIP_CTL);
 	state = REG_GET_MASK(r, ttgs[0]->thermtrip_any_en_mask);
 	seq_printf(s, "Thermtrip Any En(%d)\n", state);
@@ -376,6 +808,32 @@ static int regs_show(struct seq_file *s, void *data)
 		seq_printf(s, "Thresh(%d)\n", state);
 	}
 
+	r = readl(ts->regs + THROT_GLOBAL_CFG);
+	seq_puts(s, "\n");
+	seq_printf(s, "GLOBAL THROTTLE CONFIG: 0x%08x\n", r);
+
+	seq_puts(s, "---------------------------------------------------\n");
+	r = readl(ts->regs + THROT_STATUS);
+	state = REG_GET_MASK(r, THROT_STATUS_BREACH_MASK);
+	seq_printf(s, "THROT STATUS: breach(%d) ", state);
+	state = REG_GET_MASK(r, THROT_STATUS_STATE_MASK);
+	seq_printf(s, "state(%d) ", state);
+	state = REG_GET_MASK(r, THROT_STATUS_ENABLED_MASK);
+	seq_printf(s, "enabled(%d)\n", state);
+
+	r = readl(ts->regs + CPU_PSKIP_STATUS);
+	if (ts->soc->use_ccroc) {
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_ENABLED_MASK);
+		seq_printf(s, "CPU PSKIP STATUS: enabled(%d)\n", state);
+	} else {
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_M_MASK);
+		seq_printf(s, "CPU PSKIP STATUS: M(%d) ", state);
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_N_MASK);
+		seq_printf(s, "N(%d) ", state);
+		state = REG_GET_MASK(r, XPU_PSKIP_STATUS_ENABLED_MASK);
+		seq_printf(s, "enabled(%d)\n", state);
+	}
+
 	return 0;
 }
 
@@ -449,6 +907,326 @@ static int soctherm_clk_enable(struct platform_device *pdev, bool enable)
 	return 0;
 }
 
+static int throt_get_cdev_max_state(struct thermal_cooling_device *cdev,
+				    unsigned long *max_state)
+{
+	*max_state = 1;
+	return 0;
+}
+
+static int throt_get_cdev_cur_state(struct thermal_cooling_device *cdev,
+				    unsigned long *cur_state)
+{
+	struct tegra_soctherm *ts = cdev->devdata;
+	u32 r;
+
+	r = readl(ts->regs + THROT_STATUS);
+	if (REG_GET_MASK(r, THROT_STATUS_STATE_MASK))
+		*cur_state = 1;
+	else
+		*cur_state = 0;
+
+	return 0;
+}
+
+static int throt_set_cdev_state(struct thermal_cooling_device *cdev,
+				unsigned long cur_state)
+{
+	return 0;
+}
+
+static struct thermal_cooling_device_ops throt_cooling_ops = {
+	.get_max_state = throt_get_cdev_max_state,
+	.get_cur_state = throt_get_cdev_cur_state,
+	.set_cur_state = throt_set_cdev_state,
+};
+
+/**
+ * soctherm_init_hw_throt_cdev() - Parse the HW throttle configurations
+ * and register them as cooling devices.
+ */
+static void soctherm_init_hw_throt_cdev(struct platform_device *pdev)
+{
+	struct device *dev = &pdev->dev;
+	struct tegra_soctherm *ts = dev_get_drvdata(dev);
+	struct device_node *np_stc, *np_stcc;
+	const char *name;
+	u32 val;
+	int i, r;
+
+	for (i = 0; i < THROTTLE_SIZE; i++) {
+		ts->throt_cfgs[i].name = throt_names[i];
+		ts->throt_cfgs[i].id = i;
+		ts->throt_cfgs[i].init = false;
+	}
+
+	np_stc = of_get_child_by_name(dev->of_node, "throttle-cfgs");
+	if (!np_stc) {
+		dev_info(dev,
+			 "throttle-cfg: no throttle-cfgs - not enabling\n");
+		return;
+	}
+
+	for_each_child_of_node(np_stc, np_stcc) {
+		struct soctherm_throt_cfg *stc;
+		struct thermal_cooling_device *tcd;
+
+		name = np_stcc->name;
+		stc = find_throttle_cfg_by_name(ts, name);
+		if (!stc) {
+			dev_err(dev,
+				"throttle-cfg: could not find %s\n", name);
+			continue;
+		}
+
+		r = of_property_read_u32(np_stcc, "nvidia,priority", &val);
+		if (r) {
+			dev_info(dev,
+				 "throttle-cfg: %s: missing priority\n", name);
+			continue;
+		}
+		stc->priority = val;
+
+		if (ts->soc->use_ccroc) {
+			r = of_property_read_u32(np_stcc,
+						 "nvidia,cpu-throt-level",
+						 &val);
+			if (r) {
+				dev_info(dev,
+					 "throttle-cfg: %s: missing cpu-throt-level\n",
+					 name);
+				continue;
+			}
+			stc->cpu_throt_level = val;
+		} else {
+			r = of_property_read_u32(np_stcc,
+						 "nvidia,cpu-throt-percent",
+						 &val);
+			if (r) {
+				dev_info(dev,
+					 "throttle-cfg: %s: missing cpu-throt-percent\n",
+					 name);
+				continue;
+			}
+			stc->cpu_throt_depth = val;
+		}
+
+		tcd = thermal_of_cooling_device_register(np_stcc,
+							 (char *)name, ts,
+							 &throt_cooling_ops);
+		of_node_put(np_stcc);
+		if (IS_ERR_OR_NULL(tcd)) {
+			dev_err(dev,
+				"throttle-cfg: %s: failed to register cooling device\n",
+				name);
+			continue;
+		}
+
+		stc->cdev = tcd;
+		stc->init = true;
+	}
+
+	of_node_put(np_stc);
+}
+
+/**
+ * throttlectl_cpu_level_cfg() - programs CCROC NV_THERM level config
+ * @level: describing the level LOW/MED/HIGH of throttling
+ *
+ * It's necessary to set up the CPU-local CCROC NV_THERM instance with
+ * the M/N values desired for each level. This function does this.
+ *
+ * This function pre-programs the CCROC NV_THERM levels in terms of
+ * pre-configured "Low", "Medium" or "Heavy" throttle levels which are
+ * mapped to THROT_LEVEL_LOW, THROT_LEVEL_MED and THROT_LEVEL_HVY.
+ */
+static void throttlectl_cpu_level_cfg(struct tegra_soctherm *ts, int level)
+{
+	u8 depth, dividend;
+	u32 r;
+
+	switch (level) {
+	case TEGRA_SOCTHERM_THROT_LEVEL_LOW:
+		depth = 50;
+		break;
+	case TEGRA_SOCTHERM_THROT_LEVEL_MED:
+		depth = 75;
+		break;
+	case TEGRA_SOCTHERM_THROT_LEVEL_HIGH:
+		depth = 80;
+		break;
+	case TEGRA_SOCTHERM_THROT_LEVEL_NONE:
+		return;
+	default:
+		return;
+	}
+
+	dividend = THROT_DEPTH_DIVIDEND(depth);
+
+	/* setup PSKIP in ccroc nv_therm registers */
+	r = ccroc_readl(ts, CCROC_THROT_PSKIP_RAMP_CPU_REG(level));
+	r = REG_SET_MASK(r, CCROC_THROT_PSKIP_RAMP_DURATION_MASK, 0xff);
+	r = REG_SET_MASK(r, CCROC_THROT_PSKIP_RAMP_STEP_MASK, 0xf);
+	ccroc_writel(ts, r, CCROC_THROT_PSKIP_RAMP_CPU_REG(level));
+
+	r = ccroc_readl(ts, CCROC_THROT_PSKIP_CTRL_CPU_REG(level));
+	r = REG_SET_MASK(r, CCROC_THROT_PSKIP_CTRL_ENB_MASK, 1);
+	r = REG_SET_MASK(r, CCROC_THROT_PSKIP_CTRL_DIVIDEND_MASK, dividend);
+	r = REG_SET_MASK(r, CCROC_THROT_PSKIP_CTRL_DIVISOR_MASK, 0xff);
+	ccroc_writel(ts, r, CCROC_THROT_PSKIP_CTRL_CPU_REG(level));
+}
+
+/**
+ * throttlectl_cpu_level_select() - program CPU pulse skipper config
+ * @throt: the LIGHT/HEAVY of throttle event id
+ *
+ * Pulse skippers are used to throttle clock frequencies.  This
+ * function programs the pulse skippers based on @throt and platform
+ * data.  This function is used on SoCs which have CPU-local pulse
+ * skipper control, such as T13x. It programs soctherm's interface to
+ * Denver:CCROC NV_THERM in terms of Low, Medium and HIGH throttling
+ * vectors. PSKIP_BYPASS mode is set as required per HW spec.
+ */
+static void throttlectl_cpu_level_select(struct tegra_soctherm *ts,
+					 enum soctherm_throttle_id throt)
+{
+	u32 r, throt_vect;
+
+	/* Denver:CCROC NV_THERM interface N:3 Mapping */
+	switch (ts->throt_cfgs[throt].cpu_throt_level) {
+	case TEGRA_SOCTHERM_THROT_LEVEL_LOW:
+		throt_vect = THROT_VECT_LOW;
+		break;
+	case TEGRA_SOCTHERM_THROT_LEVEL_MED:
+		throt_vect = THROT_VECT_MED;
+		break;
+	case TEGRA_SOCTHERM_THROT_LEVEL_HIGH:
+		throt_vect = THROT_VECT_HIGH;
+		break;
+	default:
+		throt_vect = THROT_VECT_NONE;
+		break;
+	}
+
+	r = readl(ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_ENABLE_MASK, 1);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_VECT_CPU_MASK, throt_vect);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_VECT2_CPU_MASK, throt_vect);
+	writel(r, ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+
+	/* bypass sequencer in soc_therm as it is programmed in ccroc */
+	r = REG_SET_MASK(0, THROT_PSKIP_RAMP_SEQ_BYPASS_MODE_MASK, 1);
+	writel(r, ts->regs + THROT_PSKIP_RAMP(throt, THROTTLE_DEV_CPU));
+}
+
+/**
+ * throttlectl_cpu_mn() - program CPU pulse skipper configuration
+ * @throt: the LIGHT/HEAVY of throttle event id
+ *
+ * Pulse skippers are used to throttle clock frequencies.  This
+ * function programs the pulse skippers based on @throt and platform
+ * data.  This function is used for CPUs that have "remote" pulse
+ * skipper control, e.g., the CPU pulse skipper is controlled by the
+ * SOC_THERM IP block.  (SOC_THERM is located outside the CPU
+ * complex.)
+ */
+static void throttlectl_cpu_mn(struct tegra_soctherm *ts,
+			       enum soctherm_throttle_id throt)
+{
+	u32 r;
+	int depth;
+	u8 dividend;
+
+	depth = ts->throt_cfgs[throt].cpu_throt_depth;
+	dividend = THROT_DEPTH_DIVIDEND(depth);
+
+	r = readl(ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_ENABLE_MASK, 1);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_DIVIDEND_MASK, dividend);
+	r = REG_SET_MASK(r, THROT_PSKIP_CTRL_DIVISOR_MASK, 0xff);
+	writel(r, ts->regs + THROT_PSKIP_CTRL(throt, THROTTLE_DEV_CPU));
+
+	r = readl(ts->regs + THROT_PSKIP_RAMP(throt, THROTTLE_DEV_CPU));
+	r = REG_SET_MASK(r, THROT_PSKIP_RAMP_DURATION_MASK, 0xff);
+	r = REG_SET_MASK(r, THROT_PSKIP_RAMP_STEP_MASK, 0xf);
+	writel(r, ts->regs + THROT_PSKIP_RAMP(throt, THROTTLE_DEV_CPU));
+}
+
+/**
+ * soctherm_throttle_program() - programs pulse skippers' configuration
+ * @throt: the LIGHT/HEAVY of the throttle event id.
+ *
+ * Pulse skippers are used to throttle clock frequencies.
+ * This function programs the pulse skippers.
+ */
+static void soctherm_throttle_program(struct tegra_soctherm *ts,
+				      enum soctherm_throttle_id throt)
+{
+	u32 r;
+	struct soctherm_throt_cfg stc = ts->throt_cfgs[throt];
+
+	if (!stc.init)
+		return;
+
+	/* Setup PSKIP parameters */
+	if (ts->soc->use_ccroc)
+		throttlectl_cpu_level_select(ts, throt);
+	else
+		throttlectl_cpu_mn(ts, throt);
+
+	r = REG_SET_MASK(0, THROT_PRIORITY_LITE_PRIO_MASK, stc.priority);
+	writel(r, ts->regs + THROT_PRIORITY_CTRL(throt));
+
+	r = REG_SET_MASK(0, THROT_DELAY_LITE_DELAY_MASK, 0);
+	writel(r, ts->regs + THROT_DELAY_CTRL(throt));
+
+	r = readl(ts->regs + THROT_PRIORITY_LOCK);
+	r = REG_GET_MASK(r, THROT_PRIORITY_LOCK_PRIORITY_MASK);
+	if (r >= stc.priority)
+		return;
+	r = REG_SET_MASK(0, THROT_PRIORITY_LOCK_PRIORITY_MASK,
+			 stc.priority);
+	writel(r, ts->regs + THROT_PRIORITY_LOCK);
+}
+
+static void tegra_soctherm_throttle(struct device *dev)
+{
+	struct tegra_soctherm *ts = dev_get_drvdata(dev);
+	u32 v;
+	int i;
+
+	/* configure LOW, MED and HIGH levels for CCROC NV_THERM */
+	if (ts->soc->use_ccroc) {
+		throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_LOW);
+		throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_MED);
+		throttlectl_cpu_level_cfg(ts, TEGRA_SOCTHERM_THROT_LEVEL_HIGH);
+	}
+
+	/* Thermal HW throttle programming */
+	for (i = 0; i < THROTTLE_SIZE; i++)
+		soctherm_throttle_program(ts, i);
+
+	v = REG_SET_MASK(0, THROT_GLOBAL_ENB_MASK, 1);
+	if (ts->soc->use_ccroc) {
+		ccroc_writel(ts, v, CCROC_GLOBAL_CFG);
+
+		v = ccroc_readl(ts, CCROC_SUPER_CCLKG_DIVIDER);
+		v = REG_SET_MASK(v, CDIVG_USE_THERM_CONTROLS_MASK, 1);
+		ccroc_writel(ts, v, CCROC_SUPER_CCLKG_DIVIDER);
+	} else {
+		writel(v, ts->regs + THROT_GLOBAL_CFG);
+
+		v = clk_readl(ts, CAR_SUPER_CCLKG_DIVIDER);
+		v = REG_SET_MASK(v, CDIVG_USE_THERM_CONTROLS_MASK, 1);
+		clk_writel(ts, v, CAR_SUPER_CCLKG_DIVIDER);
+	}
+
+	/* initialize stats collection */
+	v = STATS_CTL_CLR_DN | STATS_CTL_EN_DN |
+	    STATS_CTL_CLR_UP | STATS_CTL_EN_UP;
+	writel(v, ts->regs + THERMCTL_STATS_CTL);
+}
+
 static void soctherm_init(struct platform_device *pdev)
 {
 	struct tegra_soctherm *tegra = platform_get_drvdata(pdev);
@@ -475,6 +1253,9 @@ static void soctherm_init(struct platform_device *pdev)
 	}
 	writel(pdiv, tegra->regs + SENSOR_PDIV);
 	writel(hotspot, tegra->regs + SENSOR_HOTSPOT_OFF);
+
+	/* Configure hw throttle */
+	tegra_soctherm_throttle(&pdev->dev);
 }
 
 static const struct of_device_id tegra_soctherm_of_match[] = {
@@ -527,10 +1308,31 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
 
 	tegra->soc = soc;
 
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
+	res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+					   "soctherm-reg");
 	tegra->regs = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(tegra->regs))
+	if (IS_ERR(tegra->regs)) {
+		dev_err(&pdev->dev, "can't get soctherm registers");
 		return PTR_ERR(tegra->regs);
+	}
+
+	if (!tegra->soc->use_ccroc) {
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   "car-reg");
+		tegra->clk_regs = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(tegra->clk_regs)) {
+			dev_err(&pdev->dev, "can't get car clk registers");
+			return PTR_ERR(tegra->clk_regs);
+		}
+	} else {
+		res = platform_get_resource_byname(pdev, IORESOURCE_MEM,
+						   "ccroc-reg");
+		tegra->ccroc_regs = devm_ioremap_resource(&pdev->dev, res);
+		if (IS_ERR(tegra->ccroc_regs)) {
+			dev_err(&pdev->dev, "can't get ccroc registers");
+			return PTR_ERR(tegra->ccroc_regs);
+		}
+	}
 
 	tegra->reset = devm_reset_control_get(&pdev->dev, "soctherm");
 	if (IS_ERR(tegra->reset)) {
@@ -580,6 +1382,8 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
 	if (err)
 		return err;
 
+	soctherm_init_hw_throt_cdev(pdev);
+
 	soctherm_init(pdev);
 
 	for (i = 0; i < soc->num_ttgs; ++i) {
@@ -593,6 +1397,7 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
 		zone->reg = tegra->regs + soc->ttgs[i]->sensor_temp_offset;
 		zone->dev = &pdev->dev;
 		zone->sg = soc->ttgs[i];
+		zone->ts = tegra;
 
 		z = devm_thermal_zone_of_sensor_register(&pdev->dev,
 							 soc->ttgs[i]->id, zone,
@@ -608,7 +1413,9 @@ static int tegra_soctherm_probe(struct platform_device *pdev)
 		tegra->thermctl_tzs[soc->ttgs[i]->id] = z;
 
 		/* Configure hw trip points */
-		tegra_soctherm_set_hwtrips(&pdev->dev, soc->ttgs[i], z);
+		err = tegra_soctherm_set_hwtrips(&pdev->dev, soc->ttgs[i], z);
+		if (err)
+			goto disable_clocks;
 	}
 
 	soctherm_debug_init(pdev);
@@ -661,7 +1468,12 @@ static int __maybe_unused soctherm_resume(struct device *dev)
 		struct thermal_zone_device *tz;
 
 		tz = tegra->thermctl_tzs[soc->ttgs[i]->id];
-		tegra_soctherm_set_hwtrips(dev, soc->ttgs[i], tz);
+		err = tegra_soctherm_set_hwtrips(dev, soc->ttgs[i], tz);
+		if (err) {
+			dev_err(&pdev->dev,
+				"Resume failed: set hwtrips failed\n");
+			return err;
+		}
 	}
 
 	return 0;
diff --git a/drivers/thermal/tegra/soctherm.h b/drivers/thermal/tegra/soctherm.h
index 28e18ec4b4c3..e96ca73fd780 100644
--- a/drivers/thermal/tegra/soctherm.h
+++ b/drivers/thermal/tegra/soctherm.h
@@ -15,6 +15,11 @@
 #ifndef __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
 #define __DRIVERS_THERMAL_TEGRA_SOCTHERM_H
 
+#define THERMCTL_LEVEL0_GROUP_CPU               0x0
+#define THERMCTL_LEVEL0_GROUP_GPU		0x4
+#define THERMCTL_LEVEL0_GROUP_MEM		0x8
+#define THERMCTL_LEVEL0_GROUP_TSENSE		0xc
+
 #define SENSOR_CONFIG2                          8
 #define SENSOR_CONFIG2_THERMA_MASK		(0xffff << 16)
 #define SENSOR_CONFIG2_THERMA_SHIFT		16
@@ -65,6 +70,9 @@ struct tegra_tsensor_group {
 	u32 thermtrip_enable_mask;
 	u32 thermtrip_any_en_mask;
 	u32 thermtrip_threshold_mask;
+	u16 thermctl_lvl0_offset;
+	u32 thermctl_lvl0_up_thresh_mask;
+	u32 thermctl_lvl0_dn_thresh_mask;
 };
 
 struct tegra_tsensor_configuration {
@@ -103,6 +111,8 @@ struct tegra_soctherm_soc {
 	const unsigned int num_ttgs;
 	const struct tegra_soctherm_fuse *tfuse;
 	const int thresh_grain;
+	const unsigned int bptt;
+	const bool use_ccroc;
 };
 
 int tegra_calc_shared_calib(const struct tegra_soctherm_fuse *tfuse,
diff --git a/drivers/thermal/tegra/tegra124-soctherm.c b/drivers/thermal/tegra/tegra124-soctherm.c
index beb9d36b9c8a..36768630f78c 100644
--- a/drivers/thermal/tegra/tegra124-soctherm.c
+++ b/drivers/thermal/tegra/tegra124-soctherm.c
@@ -28,7 +28,11 @@
 #define TEGRA124_THERMTRIP_CPU_THRESH_MASK	(0xff << 8)
 #define TEGRA124_THERMTRIP_TSENSE_THRESH_MASK	0xff
 
+#define TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK	(0xff << 17)
+#define TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK	(0xff << 9)
+
 #define TEGRA124_THRESH_GRAIN			1000
+#define TEGRA124_BPTT				8
 
 static const struct tegra_tsensor_configuration tegra124_tsensor_config = {
 	.tall = 16300,
@@ -51,6 +55,9 @@ static const struct tegra_tsensor_group tegra124_tsensor_group_cpu = {
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_CPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_CPU_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra124_tsensor_group_gpu = {
@@ -66,6 +73,9 @@ static const struct tegra_tsensor_group tegra124_tsensor_group_gpu = {
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_GPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra124_tsensor_group_pll = {
@@ -79,6 +89,9 @@ static const struct tegra_tsensor_group tegra124_tsensor_group_pll = {
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_TSENSE_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_TSENSE_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra124_tsensor_group_mem = {
@@ -94,6 +107,9 @@ static const struct tegra_tsensor_group tegra124_tsensor_group_mem = {
 	.thermtrip_any_en_mask = TEGRA124_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA124_THERMTRIP_MEM_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA124_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM,
+	.thermctl_lvl0_up_thresh_mask = TEGRA124_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA124_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group *tegra124_tsensor_groups[] = {
@@ -193,4 +209,6 @@ const struct tegra_soctherm_soc tegra124_soctherm = {
 	.num_ttgs = ARRAY_SIZE(tegra124_tsensor_groups),
 	.tfuse = &tegra124_soctherm_fuse,
 	.thresh_grain = TEGRA124_THRESH_GRAIN,
+	.bptt = TEGRA124_BPTT,
+	.use_ccroc = false,
 };
diff --git a/drivers/thermal/tegra/tegra132-soctherm.c b/drivers/thermal/tegra/tegra132-soctherm.c
index e2aa84e1b307..97fa30501eb1 100644
--- a/drivers/thermal/tegra/tegra132-soctherm.c
+++ b/drivers/thermal/tegra/tegra132-soctherm.c
@@ -28,7 +28,11 @@
 #define TEGRA132_THERMTRIP_CPU_THRESH_MASK	(0xff << 8)
 #define TEGRA132_THERMTRIP_TSENSE_THRESH_MASK	0xff
 
+#define TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK	(0xff << 17)
+#define TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK	(0xff << 9)
+
 #define TEGRA132_THRESH_GRAIN			1000
+#define TEGRA132_BPTT				8
 
 static const struct tegra_tsensor_configuration tegra132_tsensor_config = {
 	.tall = 16300,
@@ -51,6 +55,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_cpu = {
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_CPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_CPU_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra132_tsensor_group_gpu = {
@@ -66,6 +73,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_gpu = {
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_GPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra132_tsensor_group_pll = {
@@ -79,6 +89,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_pll = {
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_TSENSE_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_TSENSE_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra132_tsensor_group_mem = {
@@ -94,6 +107,9 @@ static const struct tegra_tsensor_group tegra132_tsensor_group_mem = {
 	.thermtrip_any_en_mask = TEGRA132_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA132_THERMTRIP_MEM_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA132_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM,
+	.thermctl_lvl0_up_thresh_mask = TEGRA132_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA132_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group *tegra132_tsensor_groups[] = {
@@ -193,4 +209,6 @@ const struct tegra_soctherm_soc tegra132_soctherm = {
 	.num_ttgs = ARRAY_SIZE(tegra132_tsensor_groups),
 	.tfuse = &tegra132_soctherm_fuse,
 	.thresh_grain = TEGRA132_THRESH_GRAIN,
+	.bptt = TEGRA132_BPTT,
+	.use_ccroc = true,
 };
diff --git a/drivers/thermal/tegra/tegra210-soctherm.c b/drivers/thermal/tegra/tegra210-soctherm.c
index 19cc0ab66f0e..ad53169a8e95 100644
--- a/drivers/thermal/tegra/tegra210-soctherm.c
+++ b/drivers/thermal/tegra/tegra210-soctherm.c
@@ -29,7 +29,11 @@
 #define TEGRA210_THERMTRIP_CPU_THRESH_MASK	(0x1ff << 9)
 #define TEGRA210_THERMTRIP_TSENSE_THRESH_MASK	0x1ff
 
+#define TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK	(0x1ff << 18)
+#define TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK	(0x1ff << 9)
+
 #define TEGRA210_THRESH_GRAIN			500
+#define TEGRA210_BPTT				9
 
 static const struct tegra_tsensor_configuration tegra210_tsensor_config = {
 	.tall = 16300,
@@ -52,6 +56,9 @@ static const struct tegra_tsensor_group tegra210_tsensor_group_cpu = {
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_CPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_CPU_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_CPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra210_tsensor_group_gpu = {
@@ -67,6 +74,9 @@ static const struct tegra_tsensor_group tegra210_tsensor_group_gpu = {
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_GPU_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_GPU,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra210_tsensor_group_pll = {
@@ -80,6 +90,9 @@ static const struct tegra_tsensor_group tegra210_tsensor_group_pll = {
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_TSENSE_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_TSENSE_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_TSENSE,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group tegra210_tsensor_group_mem = {
@@ -95,6 +108,9 @@ static const struct tegra_tsensor_group tegra210_tsensor_group_mem = {
 	.thermtrip_any_en_mask = TEGRA210_THERMTRIP_ANY_EN_MASK,
 	.thermtrip_enable_mask = TEGRA210_THERMTRIP_MEM_EN_MASK,
 	.thermtrip_threshold_mask = TEGRA210_THERMTRIP_GPUMEM_THRESH_MASK,
+	.thermctl_lvl0_offset = THERMCTL_LEVEL0_GROUP_MEM,
+	.thermctl_lvl0_up_thresh_mask = TEGRA210_THERMCTL_LVL0_UP_THRESH_MASK,
+	.thermctl_lvl0_dn_thresh_mask = TEGRA210_THERMCTL_LVL0_DN_THRESH_MASK,
 };
 
 static const struct tegra_tsensor_group *tegra210_tsensor_groups[] = {
@@ -194,4 +210,6 @@ const struct tegra_soctherm_soc tegra210_soctherm = {
 	.num_ttgs = ARRAY_SIZE(tegra210_tsensor_groups),
 	.tfuse = &tegra210_soctherm_fuse,
 	.thresh_grain = TEGRA210_THRESH_GRAIN,
+	.bptt = TEGRA210_BPTT,
+	.use_ccroc = false,
 };
diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c
index e2fc6161dded..226b0b4aced6 100644
--- a/drivers/thermal/thermal_core.c
+++ b/drivers/thermal/thermal_core.c
@@ -520,6 +520,56 @@ exit:
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_temp);
 
+void thermal_zone_set_trips(struct thermal_zone_device *tz)
+{
+	int low = -INT_MAX;
+	int high = INT_MAX;
+	int trip_temp, hysteresis;
+	int i, ret;
+
+	mutex_lock(&tz->lock);
+
+	if (!tz->ops->set_trips || !tz->ops->get_trip_hyst)
+		goto exit;
+
+	for (i = 0; i < tz->trips; i++) {
+		int trip_low;
+
+		tz->ops->get_trip_temp(tz, i, &trip_temp);
+		tz->ops->get_trip_hyst(tz, i, &hysteresis);
+
+		trip_low = trip_temp - hysteresis;
+
+		if (trip_low < tz->temperature && trip_low > low)
+			low = trip_low;
+
+		if (trip_temp > tz->temperature && trip_temp < high)
+			high = trip_temp;
+	}
+
+	/* No need to change trip points */
+	if (tz->prev_low_trip == low && tz->prev_high_trip == high)
+		goto exit;
+
+	tz->prev_low_trip = low;
+	tz->prev_high_trip = high;
+
+	dev_dbg(&tz->device,
+		"new temperature boundaries: %d < x < %d\n", low, high);
+
+	/*
+	 * Set a temperature window. When this window is left the driver
+	 * must inform the thermal core via thermal_zone_device_update.
+	 */
+	ret = tz->ops->set_trips(tz, low, high);
+	if (ret)
+		dev_err(&tz->device, "Failed to set trips: %d\n", ret);
+
+exit:
+	mutex_unlock(&tz->lock);
+}
+EXPORT_SYMBOL_GPL(thermal_zone_set_trips);
+
 static void update_temperature(struct thermal_zone_device *tz)
 {
 	int temp, ret;
@@ -557,7 +607,8 @@ static void thermal_zone_device_reset(struct thermal_zone_device *tz)
 		pos->initialized = false;
 }
 
-void thermal_zone_device_update(struct thermal_zone_device *tz)
+void thermal_zone_device_update(struct thermal_zone_device *tz,
+				enum thermal_notify_event event)
 {
 	int count;
 
@@ -569,6 +620,10 @@ void thermal_zone_device_update(struct thermal_zone_device *tz)
 
 	update_temperature(tz);
 
+	thermal_zone_set_trips(tz);
+
+	tz->notify_event = event;
+
 	for (count = 0; count < tz->trips; count++)
 		handle_thermal_trip(tz, count);
 }
@@ -579,7 +634,7 @@ static void thermal_zone_device_check(struct work_struct *work)
 	struct thermal_zone_device *tz = container_of(work, struct
 						      thermal_zone_device,
 						      poll_queue.work);
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 }
 
 /* sys I/F for thermal zone */
@@ -703,7 +758,7 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr,
 	if (ret)
 		return ret;
 
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return count;
 }
@@ -754,6 +809,9 @@ trip_point_hyst_store(struct device *dev, struct device_attribute *attr,
 	 */
 	ret = tz->ops->set_trip_hyst(tz, trip, temperature);
 
+	if (!ret)
+		thermal_zone_set_trips(tz);
+
 	return ret ? ret : count;
 }
 
@@ -822,7 +880,7 @@ passive_store(struct device *dev, struct device_attribute *attr,
 
 	tz->forced_passive = state;
 
-	thermal_zone_device_update(tz);
+	thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return count;
 }
@@ -913,7 +971,7 @@ emul_temp_store(struct device *dev, struct device_attribute *attr,
 	}
 
 	if (!ret)
-		thermal_zone_device_update(tz);
+		thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return ret ? ret : count;
 }
@@ -1509,7 +1567,8 @@ __thermal_cooling_device_register(struct device_node *np,
 	mutex_lock(&thermal_list_lock);
 	list_for_each_entry(pos, &thermal_tz_list, node)
 		if (atomic_cmpxchg(&pos->need_update, 1, 0))
-			thermal_zone_device_update(pos);
+			thermal_zone_device_update(pos,
+						   THERMAL_EVENT_UNSPECIFIED);
 	mutex_unlock(&thermal_list_lock);
 
 	return cdev;
@@ -1952,7 +2011,7 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type,
 	thermal_zone_device_reset(tz);
 	/* Update the new thermal zone and mark it as already updated. */
 	if (atomic_cmpxchg(&tz->need_update, 1, 0))
-		thermal_zone_device_update(tz);
+		thermal_zone_device_update(tz, THERMAL_EVENT_UNSPECIFIED);
 
 	return tz;
 
@@ -2069,6 +2128,36 @@ exit:
 }
 EXPORT_SYMBOL_GPL(thermal_zone_get_zone_by_name);
 
+/**
+ * thermal_zone_get_slope - return the slope attribute of the thermal zone
+ * @tz: thermal zone device with the slope attribute
+ *
+ * Return: If the thermal zone device has a slope attribute, return it, else
+ * return 1.
+ */
+int thermal_zone_get_slope(struct thermal_zone_device *tz)
+{
+	if (tz && tz->tzp)
+		return tz->tzp->slope;
+	return 1;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_slope);
+
+/**
+ * thermal_zone_get_offset - return the offset attribute of the thermal zone
+ * @tz: thermal zone device with the offset attribute
+ *
+ * Return: If the thermal zone device has a offset attribute, return it, else
+ * return 0.
+ */
+int thermal_zone_get_offset(struct thermal_zone_device *tz)
+{
+	if (tz && tz->tzp)
+		return tz->tzp->offset;
+	return 0;
+}
+EXPORT_SYMBOL_GPL(thermal_zone_get_offset);
+
 #ifdef CONFIG_NET
 static const struct genl_multicast_group thermal_event_mcgrps[] = {
 	{ .name = THERMAL_GENL_MCAST_GROUP_NAME, },
@@ -2209,7 +2298,8 @@ static int thermal_pm_notify(struct notifier_block *nb,
 		atomic_set(&in_suspend, 0);
 		list_for_each_entry(tz, &thermal_tz_list, node) {
 			thermal_zone_device_reset(tz);
-			thermal_zone_device_update(tz);
+			thermal_zone_device_update(tz,
+						   THERMAL_EVENT_UNSPECIFIED);
 		}
 		break;
 	default:
diff --git a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
index 15c0a9ac2209..0586bd0f2bab 100644
--- a/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
+++ b/drivers/thermal/ti-soc-thermal/ti-thermal-common.c
@@ -52,7 +52,7 @@ static void ti_thermal_work(struct work_struct *work)
 	struct ti_thermal_data *data = container_of(work,
 					struct ti_thermal_data, thermal_wq);
 
-	thermal_zone_device_update(data->ti_thermal);
+	thermal_zone_device_update(data->ti_thermal, THERMAL_EVENT_UNSPECIFIED);
 
 	dev_dbg(&data->ti_thermal->device, "updated thermal zone %s\n",
 		data->ti_thermal->type);
@@ -205,7 +205,7 @@ static int ti_thermal_set_mode(struct thermal_zone_device *thermal,
 	data->mode = mode;
 	ti_bandgap_write_update_interval(bgp, data->sensor_id,
 					data->ti_thermal->polling_delay);
-	thermal_zone_device_update(data->ti_thermal);
+	thermal_zone_device_update(data->ti_thermal, THERMAL_EVENT_UNSPECIFIED);
 	dev_dbg(&thermal->device, "thermal polling set for duration=%d msec\n",
 		data->ti_thermal->polling_delay);
 
@@ -239,7 +239,7 @@ static int ti_thermal_get_trip_temp(struct thermal_zone_device *thermal,
 	return 0;
 }
 
-static int __ti_thermal_get_trend(void *p, long *trend)
+static int __ti_thermal_get_trend(void *p, int trip, enum thermal_trend *trend)
 {
 	struct ti_thermal_data *data = p;
 	struct ti_bandgap *bgp;
@@ -252,22 +252,6 @@ static int __ti_thermal_get_trend(void *p, long *trend)
 	if (ret)
 		return ret;
 
-	*trend = tr;
-
-	return 0;
-}
-
-/* Get the temperature trend callback functions for thermal zone */
-static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
-				int trip, enum thermal_trend *trend)
-{
-	int ret;
-	long tr;
-
-	ret = __ti_thermal_get_trend(thermal->devdata, &tr);
-	if (ret)
-		return ret;
-
 	if (tr > 0)
 		*trend = THERMAL_TREND_RAISING;
 	else if (tr < 0)
@@ -278,6 +262,13 @@ static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
 	return 0;
 }
 
+/* Get the temperature trend callback functions for thermal zone */
+static int ti_thermal_get_trend(struct thermal_zone_device *thermal,
+				int trip, enum thermal_trend *trend)
+{
+	return __ti_thermal_get_trend(thermal->devdata, trip, trend);
+}
+
 /* Get critical temperature callback functions for thermal zone */
 static int ti_thermal_get_crit_temp(struct thermal_zone_device *thermal,
 				    int *temp)
diff --git a/drivers/thermal/user_space.c b/drivers/thermal/user_space.c
index 10adcddc8821..c908150c268d 100644
--- a/drivers/thermal/user_space.c
+++ b/drivers/thermal/user_space.c
@@ -23,19 +23,30 @@
  */
 
 #include <linux/thermal.h>
-
+#include <linux/slab.h>
 #include "thermal_core.h"
 
 /**
  * notify_user_space - Notifies user space about thermal events
  * @tz - thermal_zone_device
+ * @trip - Trip point index
  *
  * This function notifies the user space through UEvents.
  */
 static int notify_user_space(struct thermal_zone_device *tz, int trip)
 {
+	char *thermal_prop[5];
+	int i;
+
 	mutex_lock(&tz->lock);
-	kobject_uevent(&tz->device.kobj, KOBJ_CHANGE);
+	thermal_prop[0] = kasprintf(GFP_KERNEL, "NAME=%s", tz->type);
+	thermal_prop[1] = kasprintf(GFP_KERNEL, "TEMP=%d", tz->temperature);
+	thermal_prop[2] = kasprintf(GFP_KERNEL, "TRIP=%d", trip);
+	thermal_prop[3] = kasprintf(GFP_KERNEL, "EVENT=%d", tz->notify_event);
+	thermal_prop[4] = NULL;
+	kobject_uevent_env(&tz->device.kobj, KOBJ_CHANGE, thermal_prop);
+	for (i = 0; i < 4; ++i)
+		kfree(thermal_prop[i]);
 	mutex_unlock(&tz->lock);
 	return 0;
 }
diff --git a/drivers/thermal/x86_pkg_temp_thermal.c b/drivers/thermal/x86_pkg_temp_thermal.c
index 97f0a2bd93ed..95f4c1bcdb4c 100644
--- a/drivers/thermal/x86_pkg_temp_thermal.c
+++ b/drivers/thermal/x86_pkg_temp_thermal.c
@@ -348,7 +348,8 @@ static void pkg_temp_thermal_threshold_work_fn(struct work_struct *work)
 	}
 	if (notify) {
 		pr_debug("thermal_zone_device_update\n");
-		thermal_zone_device_update(phdev->tzone);
+		thermal_zone_device_update(phdev->tzone,
+					   THERMAL_EVENT_UNSPECIFIED);
 	}
 }
 
diff --git a/drivers/video/fbdev/Kconfig b/drivers/video/fbdev/Kconfig
index 88b008fb8a4e..af2f117208f1 100644
--- a/drivers/video/fbdev/Kconfig
+++ b/drivers/video/fbdev/Kconfig
@@ -284,12 +284,14 @@ config FB_PM2_FIFO_DISCONNECT
 config FB_ARMCLCD
 	tristate "ARM PrimeCell PL110 support"
 	depends on ARM || ARM64 || COMPILE_TEST
-	depends on FB && ARM_AMBA
+	depends on FB && ARM_AMBA && HAS_IOMEM
 	select FB_CFB_FILLRECT
 	select FB_CFB_COPYAREA
 	select FB_CFB_IMAGEBLIT
 	select FB_MODE_HELPERS if OF
 	select VIDEOMODE_HELPERS if OF
+	select BACKLIGHT_LCD_SUPPORT if OF
+	select BACKLIGHT_CLASS_DEVICE if OF
 	help
 	  This framebuffer device driver is for the ARM PrimeCell PL110
 	  Colour LCD controller.  ARM PrimeCells provide the building
@@ -305,6 +307,8 @@ config PLAT_VERSATILE_CLCD
 	def_bool ARCH_VERSATILE || ARCH_REALVIEW || ARCH_VEXPRESS || ARCH_INTEGRATOR
 	depends on ARM
 	depends on FB_ARMCLCD && FB=y
+	select REGMAP
+	select MFD_SYSCON
 
 config FB_ACORN
 	bool "Acorn VIDC support"
@@ -2443,7 +2447,6 @@ config FB_SIMPLE
 
 source "drivers/video/fbdev/omap/Kconfig"
 source "drivers/video/fbdev/omap2/Kconfig"
-source "drivers/video/fbdev/exynos/Kconfig"
 source "drivers/video/fbdev/mmp/Kconfig"
 
 config FB_SH_MOBILE_MERAM
diff --git a/drivers/video/fbdev/Makefile b/drivers/video/fbdev/Makefile
index f6731867dd26..ee8c81405a7f 100644
--- a/drivers/video/fbdev/Makefile
+++ b/drivers/video/fbdev/Makefile
@@ -6,8 +6,6 @@
 
 obj-y				+= core/
 
-obj-$(CONFIG_EXYNOS_VIDEO)     += exynos/
-
 obj-$(CONFIG_FB_MACMODES)      += macmodes.o
 obj-$(CONFIG_FB_WMT_GE_ROPS)   += wmt_ge_rops.o
 
@@ -79,6 +77,7 @@ obj-$(CONFIG_FB_ATMEL)		  += atmel_lcdfb.o
 obj-$(CONFIG_FB_PVR2)             += pvr2fb.o
 obj-$(CONFIG_FB_VOODOO1)          += sstfb.o
 obj-$(CONFIG_FB_ARMCLCD)	  += amba-clcd.o
+obj-$(CONFIG_ARCH_NOMADIK)	  += amba-clcd-nomadik.o
 obj-$(CONFIG_PLAT_VERSATILE_CLCD) += amba-clcd-versatile.o
 obj-$(CONFIG_FB_GOLDFISH)         += goldfishfb.o
 obj-$(CONFIG_FB_68328)            += 68328fb.o
diff --git a/drivers/video/fbdev/amba-clcd-nomadik.c b/drivers/video/fbdev/amba-clcd-nomadik.c
new file mode 100644
index 000000000000..0c06fcaaa6e8
--- /dev/null
+++ b/drivers/video/fbdev/amba-clcd-nomadik.c
@@ -0,0 +1,259 @@
+#include <linux/amba/bus.h>
+#include <linux/amba/clcd.h>
+#include <linux/gpio/consumer.h>
+#include <linux/of.h>
+#include <linux/of_graph.h>
+#include <linux/delay.h>
+#include <linux/bitops.h>
+#include <linux/mfd/syscon.h>
+#include <linux/regmap.h>
+
+#include "amba-clcd-nomadik.h"
+
+static struct gpio_desc *grestb;
+static struct gpio_desc *scen;
+static struct gpio_desc *scl;
+static struct gpio_desc *sda;
+
+static u8 tpg110_readwrite_reg(bool write, u8 address, u8 outval)
+{
+	int i;
+	u8 inval = 0;
+
+	/* Assert SCEN */
+	gpiod_set_value_cansleep(scen, 1);
+	ndelay(150);
+	/* Hammer out the address */
+	for (i = 5; i >= 0; i--) {
+		if (address & BIT(i))
+			gpiod_set_value_cansleep(sda, 1);
+		else
+			gpiod_set_value_cansleep(sda, 0);
+		ndelay(150);
+		/* Send an SCL pulse */
+		gpiod_set_value_cansleep(scl, 1);
+		ndelay(160);
+		gpiod_set_value_cansleep(scl, 0);
+		ndelay(160);
+	}
+
+	if (write) {
+		/* WRITE */
+		gpiod_set_value_cansleep(sda, 0);
+	} else {
+		/* READ */
+		gpiod_set_value_cansleep(sda, 1);
+	}
+	ndelay(150);
+	/* Send an SCL pulse */
+	gpiod_set_value_cansleep(scl, 1);
+	ndelay(160);
+	gpiod_set_value_cansleep(scl, 0);
+	ndelay(160);
+
+	if (!write)
+		/* HiZ turn-around cycle */
+		gpiod_direction_input(sda);
+	ndelay(150);
+	/* Send an SCL pulse */
+	gpiod_set_value_cansleep(scl, 1);
+	ndelay(160);
+	gpiod_set_value_cansleep(scl, 0);
+	ndelay(160);
+
+	/* Hammer in/out the data */
+	for (i = 7; i >= 0; i--) {
+		int value;
+
+		if (write) {
+			value = !!(outval & BIT(i));
+			gpiod_set_value_cansleep(sda, value);
+		} else {
+			value = gpiod_get_value(sda);
+			if (value)
+				inval |= BIT(i);
+		}
+		ndelay(150);
+		/* Send an SCL pulse */
+		gpiod_set_value_cansleep(scl, 1);
+		ndelay(160);
+		gpiod_set_value_cansleep(scl, 0);
+		ndelay(160);
+	}
+
+	gpiod_direction_output(sda, 0);
+	/* Deassert SCEN */
+	gpiod_set_value_cansleep(scen, 0);
+	/* Satisfies SCEN pulse width */
+	udelay(1);
+
+	return inval;
+}
+
+static u8 tpg110_read_reg(u8 address)
+{
+	return tpg110_readwrite_reg(false, address, 0);
+}
+
+static void tpg110_write_reg(u8 address, u8 outval)
+{
+	tpg110_readwrite_reg(true, address, outval);
+}
+
+static void tpg110_startup(struct device *dev)
+{
+	u8 val;
+
+	dev_info(dev, "TPG110 display enable\n");
+	/* De-assert the reset signal */
+	gpiod_set_value_cansleep(grestb, 0);
+	mdelay(1);
+	dev_info(dev, "de-asserted GRESTB\n");
+
+	/* Test display communication */
+	tpg110_write_reg(0x00, 0x55);
+	val = tpg110_read_reg(0x00);
+	if (val == 0x55)
+		dev_info(dev, "passed communication test\n");
+	val = tpg110_read_reg(0x01);
+	dev_info(dev, "TPG110 chip ID: %d version: %d\n",
+		val>>4, val&0x0f);
+
+	/* Show display resolution */
+	val = tpg110_read_reg(0x02);
+	val &= 7;
+	switch (val) {
+	case 0x0:
+		dev_info(dev, "IN 400x240 RGB -> OUT 800x480 RGB (dual scan)");
+		break;
+	case 0x1:
+		dev_info(dev, "IN 480x272 RGB -> OUT 800x480 RGB (dual scan)");
+		break;
+	case 0x4:
+		dev_info(dev, "480x640 RGB");
+		break;
+	case 0x5:
+		dev_info(dev, "480x272 RGB");
+		break;
+	case 0x6:
+		dev_info(dev, "640x480 RGB");
+		break;
+	case 0x7:
+		dev_info(dev, "800x480 RGB");
+		break;
+	default:
+		dev_info(dev, "ILLEGAL RESOLUTION");
+		break;
+	}
+
+	val = tpg110_read_reg(0x03);
+	dev_info(dev, "resolution is controlled by %s\n",
+		(val & BIT(7)) ? "software" : "hardware");
+}
+
+static void tpg110_enable(struct clcd_fb *fb)
+{
+	struct device *dev = &fb->dev->dev;
+	static bool startup;
+	u8 val;
+
+	if (!startup) {
+		tpg110_startup(dev);
+		startup = true;
+	}
+
+	/* Take chip out of standby */
+	val = tpg110_read_reg(0x03);
+	val |= BIT(0);
+	tpg110_write_reg(0x03, val);
+}
+
+static void tpg110_disable(struct clcd_fb *fb)
+{
+	u8 val;
+
+	dev_info(&fb->dev->dev, "TPG110 display disable\n");
+	val = tpg110_read_reg(0x03);
+	/* Put into standby */
+	val &= ~BIT(0);
+	tpg110_write_reg(0x03, val);
+}
+
+static void tpg110_init(struct device *dev, struct device_node *np,
+			struct clcd_board *board)
+{
+	dev_info(dev, "TPG110 display init\n");
+
+	grestb = devm_get_gpiod_from_child(dev, "grestb", &np->fwnode);
+	if (IS_ERR(grestb)) {
+		dev_err(dev, "no GRESTB GPIO\n");
+		return;
+	}
+	/* This asserts the GRESTB signal, putting the display into reset */
+	gpiod_direction_output(grestb, 1);
+
+	scen = devm_get_gpiod_from_child(dev, "scen", &np->fwnode);
+	if (IS_ERR(scen)) {
+		dev_err(dev, "no SCEN GPIO\n");
+		return;
+	}
+	gpiod_direction_output(scen, 0);
+	scl = devm_get_gpiod_from_child(dev, "scl", &np->fwnode);
+	if (IS_ERR(scl)) {
+		dev_err(dev, "no SCL GPIO\n");
+		return;
+	}
+	gpiod_direction_output(scl, 0);
+	sda = devm_get_gpiod_from_child(dev, "sda", &np->fwnode);
+	if (IS_ERR(sda)) {
+		dev_err(dev, "no SDA GPIO\n");
+		return;
+	}
+	gpiod_direction_output(sda, 0);
+	board->enable = tpg110_enable;
+	board->disable = tpg110_disable;
+}
+
+int nomadik_clcd_init_panel(struct clcd_fb *fb,
+			    struct device_node *endpoint)
+{
+	struct device_node *panel;
+
+	panel = of_graph_get_remote_port_parent(endpoint);
+	if (!panel)
+		return -ENODEV;
+
+	if (of_device_is_compatible(panel, "tpo,tpg110"))
+		tpg110_init(&fb->dev->dev, panel, fb->board);
+	else
+		dev_info(&fb->dev->dev, "unknown panel\n");
+
+	/* Unknown panel, fall through */
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nomadik_clcd_init_panel);
+
+#define PMU_CTRL_OFFSET 0x0000
+#define PMU_CTRL_LCDNDIF BIT(26)
+
+int nomadik_clcd_init_board(struct amba_device *adev,
+			    struct clcd_board *board)
+{
+	struct regmap *pmu_regmap;
+
+	dev_info(&adev->dev, "Nomadik CLCD board init\n");
+	pmu_regmap =
+		syscon_regmap_lookup_by_compatible("stericsson,nomadik-pmu");
+	if (IS_ERR(pmu_regmap)) {
+		dev_err(&adev->dev, "could not find PMU syscon regmap\n");
+		return PTR_ERR(pmu_regmap);
+	}
+	regmap_update_bits(pmu_regmap,
+			   PMU_CTRL_OFFSET,
+			   PMU_CTRL_LCDNDIF,
+			   0);
+	dev_info(&adev->dev, "set PMU mux to CLCD mode\n");
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(nomadik_clcd_init_board);
diff --git a/drivers/video/fbdev/amba-clcd-nomadik.h b/drivers/video/fbdev/amba-clcd-nomadik.h
new file mode 100644
index 000000000000..50aa9bda69fd
--- /dev/null
+++ b/drivers/video/fbdev/amba-clcd-nomadik.h
@@ -0,0 +1,24 @@
+#ifndef _AMBA_CLCD_NOMADIK_H
+#define _AMBA_CLCD_NOMADIK_H
+
+#include <linux/amba/bus.h>
+
+#ifdef CONFIG_ARCH_NOMADIK
+int nomadik_clcd_init_board(struct amba_device *adev,
+			     struct clcd_board *board);
+int nomadik_clcd_init_panel(struct clcd_fb *fb,
+			    struct device_node *endpoint);
+#else
+static inline int nomadik_clcd_init_board(struct amba_device *adev,
+					  struct clcd_board *board)
+{
+	return 0;
+}
+static inline int nomadik_clcd_init_panel(struct clcd_fb *fb,
+					  struct device_node *endpoint)
+{
+	return 0;
+}
+#endif
+
+#endif /* inclusion guard */
diff --git a/drivers/video/fbdev/amba-clcd-versatile.c b/drivers/video/fbdev/amba-clcd-versatile.c
index a8a22daa3f9d..19ad8645d93c 100644
--- a/drivers/video/fbdev/amba-clcd-versatile.c
+++ b/drivers/video/fbdev/amba-clcd-versatile.c
@@ -3,6 +3,12 @@
 #include <linux/amba/bus.h>
 #include <linux/amba/clcd.h>
 #include <linux/platform_data/video-clcd-versatile.h>
+#include <linux/of.h>
+#include <linux/of_graph.h>
+#include <linux/regmap.h>
+#include <linux/mfd/syscon.h>
+#include <linux/bitops.h>
+#include "amba-clcd-versatile.h"
 
 static struct clcd_panel vga = {
 	.mode		= {
@@ -178,3 +184,392 @@ void versatile_clcd_remove_dma(struct clcd_fb *fb)
 	dma_free_wc(&fb->dev->dev, fb->fb.fix.smem_len, fb->fb.screen_base,
 		    fb->fb.fix.smem_start);
 }
+
+#ifdef CONFIG_OF
+
+static struct regmap *versatile_syscon_map;
+static struct regmap *versatile_ib2_map;
+
+/*
+ * We detect the different syscon types from the compatible strings.
+ */
+enum versatile_clcd {
+	INTEGRATOR_CLCD_CM,
+	VERSATILE_CLCD,
+	REALVIEW_CLCD_EB,
+	REALVIEW_CLCD_PB1176,
+	REALVIEW_CLCD_PB11MP,
+	REALVIEW_CLCD_PBA8,
+	REALVIEW_CLCD_PBX,
+};
+
+static const struct of_device_id versatile_clcd_of_match[] = {
+	{
+		.compatible = "arm,core-module-integrator",
+		.data = (void *)INTEGRATOR_CLCD_CM,
+	},
+	{
+		.compatible = "arm,versatile-sysreg",
+		.data = (void *)VERSATILE_CLCD,
+	},
+	{
+		.compatible = "arm,realview-eb-syscon",
+		.data = (void *)REALVIEW_CLCD_EB,
+	},
+	{
+		.compatible = "arm,realview-pb1176-syscon",
+		.data = (void *)REALVIEW_CLCD_PB1176,
+	},
+	{
+		.compatible = "arm,realview-pb11mp-syscon",
+		.data = (void *)REALVIEW_CLCD_PB11MP,
+	},
+	{
+		.compatible = "arm,realview-pba8-syscon",
+		.data = (void *)REALVIEW_CLCD_PBA8,
+	},
+	{
+		.compatible = "arm,realview-pbx-syscon",
+		.data = (void *)REALVIEW_CLCD_PBX,
+	},
+	{},
+};
+
+/*
+ * Core module CLCD control on the Integrator/CP, bits
+ * 8 thru 19 of the CM_CONTROL register controls a bunch
+ * of CLCD settings.
+ */
+#define INTEGRATOR_HDR_CTRL_OFFSET	0x0C
+#define INTEGRATOR_CLCD_LCDBIASEN	BIT(8)
+#define INTEGRATOR_CLCD_LCDBIASUP	BIT(9)
+#define INTEGRATOR_CLCD_LCDBIASDN	BIT(10)
+/* Bits 11,12,13 controls the LCD type */
+#define INTEGRATOR_CLCD_LCDMUX_MASK	(BIT(11)|BIT(12)|BIT(13))
+#define INTEGRATOR_CLCD_LCDMUX_LCD24	BIT(11)
+#define INTEGRATOR_CLCD_LCDMUX_VGA565	BIT(12)
+#define INTEGRATOR_CLCD_LCDMUX_SHARP	(BIT(11)|BIT(12))
+#define INTEGRATOR_CLCD_LCDMUX_VGA555	BIT(13)
+#define INTEGRATOR_CLCD_LCDMUX_VGA24	(BIT(11)|BIT(12)|BIT(13))
+#define INTEGRATOR_CLCD_LCD0_EN		BIT(14)
+#define INTEGRATOR_CLCD_LCD1_EN		BIT(15)
+/* R/L flip on Sharp */
+#define INTEGRATOR_CLCD_LCD_STATIC1	BIT(16)
+/* U/D flip on Sharp */
+#define INTEGRATOR_CLCD_LCD_STATIC2	BIT(17)
+/* No connection on Sharp */
+#define INTEGRATOR_CLCD_LCD_STATIC	BIT(18)
+/* 0 = 24bit VGA, 1 = 18bit VGA */
+#define INTEGRATOR_CLCD_LCD_N24BITEN	BIT(19)
+
+#define INTEGRATOR_CLCD_MASK		(INTEGRATOR_CLCD_LCDBIASEN | \
+					 INTEGRATOR_CLCD_LCDBIASUP | \
+					 INTEGRATOR_CLCD_LCDBIASDN | \
+					 INTEGRATOR_CLCD_LCDMUX_MASK | \
+					 INTEGRATOR_CLCD_LCD0_EN | \
+					 INTEGRATOR_CLCD_LCD1_EN | \
+					 INTEGRATOR_CLCD_LCD_STATIC1 | \
+					 INTEGRATOR_CLCD_LCD_STATIC2 | \
+					 INTEGRATOR_CLCD_LCD_STATIC | \
+					 INTEGRATOR_CLCD_LCD_N24BITEN)
+
+static void integrator_clcd_enable(struct clcd_fb *fb)
+{
+	struct fb_var_screeninfo *var = &fb->fb.var;
+	u32 val;
+
+	dev_info(&fb->dev->dev, "enable Integrator CLCD connectors\n");
+
+	/* FIXME: really needed? */
+	val = INTEGRATOR_CLCD_LCD_STATIC1 | INTEGRATOR_CLCD_LCD_STATIC2 |
+		INTEGRATOR_CLCD_LCD0_EN | INTEGRATOR_CLCD_LCD1_EN;
+	if (var->bits_per_pixel <= 8 ||
+	    (var->bits_per_pixel == 16 && var->green.length == 5))
+		/* Pseudocolor, RGB555, BGR555 */
+		val |= INTEGRATOR_CLCD_LCDMUX_VGA555;
+	else if (fb->fb.var.bits_per_pixel <= 16)
+		/* truecolor RGB565 */
+		val |= INTEGRATOR_CLCD_LCDMUX_VGA565;
+	else
+		val = 0; /* no idea for this, don't trust the docs */
+
+	regmap_update_bits(versatile_syscon_map,
+			   INTEGRATOR_HDR_CTRL_OFFSET,
+			   INTEGRATOR_CLCD_MASK,
+			   val);
+}
+
+/*
+ * This configuration register in the Versatile and RealView
+ * family is uniformly present but appears more and more
+ * unutilized starting with the RealView series.
+ */
+#define SYS_CLCD			0x50
+#define SYS_CLCD_MODE_MASK		(BIT(0)|BIT(1))
+#define SYS_CLCD_MODE_888		0
+#define SYS_CLCD_MODE_5551		BIT(0)
+#define SYS_CLCD_MODE_565_R_LSB		BIT(1)
+#define SYS_CLCD_MODE_565_B_LSB		(BIT(0)|BIT(1))
+#define SYS_CLCD_CONNECTOR_MASK		(BIT(2)|BIT(3)|BIT(4)|BIT(5))
+#define SYS_CLCD_NLCDIOON		BIT(2)
+#define SYS_CLCD_VDDPOSSWITCH		BIT(3)
+#define SYS_CLCD_PWR3V5SWITCH		BIT(4)
+#define SYS_CLCD_VDDNEGSWITCH		BIT(5)
+#define SYS_CLCD_TSNSS			BIT(6) /* touchscreen enable */
+#define SYS_CLCD_SSPEXP			BIT(7) /* SSP expansion enable */
+
+/* The Versatile can detect the connected panel type */
+#define SYS_CLCD_CLCDID_MASK		(BIT(8)|BIT(9)|BIT(10)|BIT(11)|BIT(12))
+#define SYS_CLCD_ID_SANYO_3_8		(0x00 << 8)
+#define SYS_CLCD_ID_SHARP_8_4		(0x01 << 8)
+#define SYS_CLCD_ID_EPSON_2_2		(0x02 << 8)
+#define SYS_CLCD_ID_SANYO_2_5		(0x07 << 8)
+#define SYS_CLCD_ID_VGA			(0x1f << 8)
+
+#define SYS_CLCD_TSNDAV			BIT(13) /* data ready from TS */
+
+/* IB2 control register for the Versatile daughterboard */
+#define IB2_CTRL			0x00
+#define IB2_CTRL_LCD_SD			BIT(1) /* 1 = shut down LCD */
+#define IB2_CTRL_LCD_BL_ON		BIT(0)
+#define IB2_CTRL_LCD_MASK		(BIT(0)|BIT(1))
+
+static void versatile_clcd_disable(struct clcd_fb *fb)
+{
+	dev_info(&fb->dev->dev, "disable Versatile CLCD connectors\n");
+	regmap_update_bits(versatile_syscon_map,
+			   SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK,
+			   0);
+
+	/* If we're on an IB2 daughterboard, turn off display */
+	if (versatile_ib2_map) {
+		dev_info(&fb->dev->dev, "disable IB2 display\n");
+		regmap_update_bits(versatile_ib2_map,
+				   IB2_CTRL,
+				   IB2_CTRL_LCD_MASK,
+				   IB2_CTRL_LCD_SD);
+	}
+}
+
+static void versatile_clcd_enable(struct clcd_fb *fb)
+{
+	struct fb_var_screeninfo *var = &fb->fb.var;
+	u32 val = 0;
+
+	dev_info(&fb->dev->dev, "enable Versatile CLCD connectors\n");
+	switch (var->green.length) {
+	case 5:
+		val |= SYS_CLCD_MODE_5551;
+		break;
+	case 6:
+		if (var->red.offset == 0)
+			val |= SYS_CLCD_MODE_565_R_LSB;
+		else
+			val |= SYS_CLCD_MODE_565_B_LSB;
+		break;
+	case 8:
+		val |= SYS_CLCD_MODE_888;
+		break;
+	}
+
+	/* Set up the MUX */
+	regmap_update_bits(versatile_syscon_map,
+			   SYS_CLCD,
+			   SYS_CLCD_MODE_MASK,
+			   val);
+
+	/* Then enable the display */
+	regmap_update_bits(versatile_syscon_map,
+			   SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK,
+			   SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH);
+
+	/* If we're on an IB2 daughterboard, turn on display */
+	if (versatile_ib2_map) {
+		dev_info(&fb->dev->dev, "enable IB2 display\n");
+		regmap_update_bits(versatile_ib2_map,
+				   IB2_CTRL,
+				   IB2_CTRL_LCD_MASK,
+				   IB2_CTRL_LCD_BL_ON);
+	}
+}
+
+static void versatile_clcd_decode(struct clcd_fb *fb, struct clcd_regs *regs)
+{
+	clcdfb_decode(fb, regs);
+
+	/* Always clear BGR for RGB565: we do the routing externally */
+	if (fb->fb.var.green.length == 6)
+		regs->cntl &= ~CNTL_BGR;
+}
+
+static void realview_clcd_disable(struct clcd_fb *fb)
+{
+	dev_info(&fb->dev->dev, "disable RealView CLCD connectors\n");
+	regmap_update_bits(versatile_syscon_map,
+			   SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK,
+			   0);
+}
+
+static void realview_clcd_enable(struct clcd_fb *fb)
+{
+	dev_info(&fb->dev->dev, "enable RealView CLCD connectors\n");
+	regmap_update_bits(versatile_syscon_map,
+			   SYS_CLCD,
+			   SYS_CLCD_CONNECTOR_MASK,
+			   SYS_CLCD_NLCDIOON | SYS_CLCD_PWR3V5SWITCH);
+}
+
+struct versatile_panel {
+	u32 id;
+	char *compatible;
+	bool ib2;
+};
+
+static const struct versatile_panel versatile_panels[] = {
+	{
+		.id = SYS_CLCD_ID_VGA,
+		.compatible = "VGA",
+	},
+	{
+		.id = SYS_CLCD_ID_SANYO_3_8,
+		.compatible = "sanyo,tm38qv67a02a",
+	},
+	{
+		.id = SYS_CLCD_ID_SHARP_8_4,
+		.compatible = "sharp,lq084v1dg21",
+	},
+	{
+		.id = SYS_CLCD_ID_EPSON_2_2,
+		.compatible = "epson,l2f50113t00",
+	},
+	{
+		.id = SYS_CLCD_ID_SANYO_2_5,
+		.compatible = "sanyo,alr252rgt",
+		.ib2 = true,
+	},
+};
+
+static void versatile_panel_probe(struct device *dev,
+				  struct device_node *endpoint)
+{
+	struct versatile_panel const *vpanel = NULL;
+	struct device_node *panel = NULL;
+	u32 val;
+	int ret;
+	int i;
+
+	/*
+	 * The Versatile CLCD has a panel auto-detection mechanism.
+	 * We use this and look for the compatible panel in the
+	 * device tree.
+	 */
+	ret = regmap_read(versatile_syscon_map, SYS_CLCD, &val);
+	if (ret) {
+		dev_err(dev, "cannot read CLCD syscon register\n");
+		return;
+	}
+	val &= SYS_CLCD_CLCDID_MASK;
+
+	/* First find corresponding panel information */
+	for (i = 0; i < ARRAY_SIZE(versatile_panels); i++) {
+		vpanel = &versatile_panels[i];
+
+		if (val == vpanel->id) {
+			dev_err(dev, "autodetected panel \"%s\"\n",
+				vpanel->compatible);
+			break;
+		}
+	}
+	if (i == ARRAY_SIZE(versatile_panels)) {
+		dev_err(dev, "could not auto-detect panel\n");
+		return;
+	}
+
+	panel = of_graph_get_remote_port_parent(endpoint);
+	if (!panel) {
+		dev_err(dev, "could not locate panel in DT\n");
+		return;
+	}
+	if (!of_device_is_compatible(panel, vpanel->compatible))
+		dev_err(dev, "panel in DT is not compatible with the "
+			"auto-detected panel, continuing anyway\n");
+
+	/*
+	 * If we have a Sanyo 2.5" port
+	 * that we're running on an IB2 and proceed to look for the
+	 * IB2 syscon regmap.
+	 */
+	if (!vpanel->ib2)
+		return;
+
+	versatile_ib2_map = syscon_regmap_lookup_by_compatible(
+		"arm,versatile-ib2-syscon");
+	if (IS_ERR(versatile_ib2_map)) {
+		dev_err(dev, "could not locate IB2 control register\n");
+		versatile_ib2_map = NULL;
+		return;
+	}
+}
+
+int versatile_clcd_init_panel(struct clcd_fb *fb,
+			      struct device_node *endpoint)
+{
+	const struct of_device_id *clcd_id;
+	enum versatile_clcd versatile_clcd_type;
+	struct device_node *np;
+	struct regmap *map;
+	struct device *dev = &fb->dev->dev;
+
+	np = of_find_matching_node_and_match(NULL, versatile_clcd_of_match,
+					     &clcd_id);
+	if (!np) {
+		dev_err(dev, "no Versatile syscon node\n");
+		return -ENODEV;
+	}
+	versatile_clcd_type = (enum versatile_clcd)clcd_id->data;
+
+	map = syscon_node_to_regmap(np);
+	if (IS_ERR(map)) {
+		dev_err(dev, "no Versatile syscon regmap\n");
+		return PTR_ERR(map);
+	}
+
+	switch (versatile_clcd_type) {
+	case INTEGRATOR_CLCD_CM:
+		versatile_syscon_map = map;
+		fb->board->enable = integrator_clcd_enable;
+		/* Override the caps, we have only these */
+		fb->board->caps = CLCD_CAP_5551 | CLCD_CAP_RGB565 |
+			CLCD_CAP_888;
+		dev_info(dev, "set up callbacks for Integrator PL110\n");
+		break;
+	case VERSATILE_CLCD:
+		versatile_syscon_map = map;
+		fb->board->enable = versatile_clcd_enable;
+		fb->board->disable = versatile_clcd_disable;
+		fb->board->decode = versatile_clcd_decode;
+		versatile_panel_probe(dev, endpoint);
+		dev_info(dev, "set up callbacks for Versatile\n");
+		break;
+	case REALVIEW_CLCD_EB:
+	case REALVIEW_CLCD_PB1176:
+	case REALVIEW_CLCD_PB11MP:
+	case REALVIEW_CLCD_PBA8:
+	case REALVIEW_CLCD_PBX:
+		versatile_syscon_map = map;
+		fb->board->enable = realview_clcd_enable;
+		fb->board->disable = realview_clcd_disable;
+		dev_info(dev, "set up callbacks for RealView PL111\n");
+		break;
+	default:
+		dev_info(dev, "unknown Versatile system controller\n");
+		break;
+	}
+
+	return 0;
+}
+EXPORT_SYMBOL_GPL(versatile_clcd_init_panel);
+#endif
diff --git a/drivers/video/fbdev/amba-clcd-versatile.h b/drivers/video/fbdev/amba-clcd-versatile.h
new file mode 100644
index 000000000000..1b14359c2cf6
--- /dev/null
+++ b/drivers/video/fbdev/amba-clcd-versatile.h
@@ -0,0 +1,17 @@
+/*
+ * Special local versatile callbacks
+ */
+#include <linux/of.h>
+#include <linux/amba/bus.h>
+#include <linux/platform_data/video-clcd-versatile.h>
+
+#if defined(CONFIG_PLAT_VERSATILE_CLCD) && defined(CONFIG_OF)
+int versatile_clcd_init_panel(struct clcd_fb *fb,
+			      struct device_node *endpoint);
+#else
+static inline int versatile_clcd_init_panel(struct clcd_fb *fb,
+				struct device_node *endpoint)
+{
+	return 0;
+}
+#endif
diff --git a/drivers/video/fbdev/amba-clcd.c b/drivers/video/fbdev/amba-clcd.c
index 9b158869cb89..ec2671d98abc 100644
--- a/drivers/video/fbdev/amba-clcd.c
+++ b/drivers/video/fbdev/amba-clcd.c
@@ -30,10 +30,14 @@
 #include <linux/of.h>
 #include <linux/of_address.h>
 #include <linux/of_graph.h>
+#include <linux/backlight.h>
 #include <video/display_timing.h>
 #include <video/of_display_timing.h>
 #include <video/videomode.h>
 
+#include "amba-clcd-nomadik.h"
+#include "amba-clcd-versatile.h"
+
 #define to_clcd(info)	container_of(info, struct clcd_fb, fb)
 
 /* This is limited to 16 characters when displayed by X startup */
@@ -71,6 +75,11 @@ static void clcdfb_disable(struct clcd_fb *fb)
 	if (fb->board->disable)
 		fb->board->disable(fb);
 
+	if (fb->panel->backlight) {
+		fb->panel->backlight->props.power = FB_BLANK_POWERDOWN;
+		backlight_update_status(fb->panel->backlight);
+	}
+
 	val = readl(fb->regs + fb->off_cntl);
 	if (val & CNTL_LCDPWR) {
 		val &= ~CNTL_LCDPWR;
@@ -117,6 +126,14 @@ static void clcdfb_enable(struct clcd_fb *fb, u32 cntl)
 	writel(cntl, fb->regs + fb->off_cntl);
 
 	/*
+	 * Turn on backlight
+	 */
+	if (fb->panel->backlight) {
+		fb->panel->backlight->props.power = FB_BLANK_UNBLANK;
+		backlight_update_status(fb->panel->backlight);
+	}
+
+	/*
 	 * finally, enable the interface.
 	 */
 	if (fb->board->enable)
@@ -211,6 +228,15 @@ clcdfb_set_bitfields(struct clcd_fb *fb, struct fb_var_screeninfo *var)
 			var->blue.length = 4;
 		}
 		break;
+	case 24:
+		if (fb->vendor->packed_24_bit_pixels) {
+			var->red.length = 8;
+			var->green.length = 8;
+			var->blue.length = 8;
+		} else {
+			ret = -EINVAL;
+		}
+		break;
 	case 32:
 		/* If we can't do 888, reject */
 		caps &= CLCD_CAP_888;
@@ -297,6 +323,12 @@ static int clcdfb_set_par(struct fb_info *info)
 
 	clcdfb_disable(fb);
 
+	/* Some variants must be clocked here */
+	if (fb->vendor->clock_timregs && !fb->clk_enabled) {
+		fb->clk_enabled = true;
+		clk_enable(fb->clk);
+	}
+
 	writel(regs.tim0, fb->regs + CLCD_TIM0);
 	writel(regs.tim1, fb->regs + CLCD_TIM1);
 	writel(regs.tim2, fb->regs + CLCD_TIM2);
@@ -551,7 +583,7 @@ static int clcdfb_register(struct clcd_fb *fb)
 
 #ifdef CONFIG_OF
 static int clcdfb_of_get_dpi_panel_mode(struct device_node *node,
-		struct fb_videomode *mode)
+		struct clcd_panel *clcd_panel)
 {
 	int err;
 	struct display_timing timing;
@@ -563,10 +595,31 @@ static int clcdfb_of_get_dpi_panel_mode(struct device_node *node,
 
 	videomode_from_timing(&timing, &video);
 
-	err = fb_videomode_from_videomode(&video, mode);
+	err = fb_videomode_from_videomode(&video, &clcd_panel->mode);
 	if (err)
 		return err;
 
+	/* Set up some inversion flags */
+	if (timing.flags & DISPLAY_FLAGS_PIXDATA_NEGEDGE)
+		clcd_panel->tim2 |= TIM2_IPC;
+	else if (!(timing.flags & DISPLAY_FLAGS_PIXDATA_POSEDGE))
+		/*
+		 * To preserve backwards compatibility, the IPC (inverted
+		 * pixel clock) flag needs to be set on any display that
+		 * doesn't explicitly specify that the pixel clock is
+		 * active on the negative or positive edge.
+		 */
+		clcd_panel->tim2 |= TIM2_IPC;
+
+	if (timing.flags & DISPLAY_FLAGS_HSYNC_LOW)
+		clcd_panel->tim2 |= TIM2_IHS;
+
+	if (timing.flags & DISPLAY_FLAGS_VSYNC_LOW)
+		clcd_panel->tim2 |= TIM2_IVS;
+
+	if (timing.flags & DISPLAY_FLAGS_DE_LOW)
+		clcd_panel->tim2 |= TIM2_IOE;
+
 	return 0;
 }
 
@@ -576,11 +629,34 @@ static int clcdfb_snprintf_mode(char *buf, int size, struct fb_videomode *mode)
 			mode->refresh);
 }
 
+static int clcdfb_of_get_backlight(struct device_node *endpoint,
+				   struct clcd_panel *clcd_panel)
+{
+	struct device_node *panel;
+	struct device_node *backlight;
+
+	panel = of_graph_get_remote_port_parent(endpoint);
+	if (!panel)
+		return -ENODEV;
+
+	/* Look up the optional backlight phandle */
+	backlight = of_parse_phandle(panel, "backlight", 0);
+	if (backlight) {
+		clcd_panel->backlight = of_find_backlight_by_node(backlight);
+		of_node_put(backlight);
+
+		if (!clcd_panel->backlight)
+			return -EPROBE_DEFER;
+	}
+	return 0;
+}
+
 static int clcdfb_of_get_mode(struct device *dev, struct device_node *endpoint,
-		struct fb_videomode *mode)
+		struct clcd_panel *clcd_panel)
 {
 	int err;
 	struct device_node *panel;
+	struct fb_videomode *mode;
 	char *name;
 	int len;
 
@@ -590,11 +666,12 @@ static int clcdfb_of_get_mode(struct device *dev, struct device_node *endpoint,
 
 	/* Only directly connected DPI panels supported for now */
 	if (of_device_is_compatible(panel, "panel-dpi"))
-		err = clcdfb_of_get_dpi_panel_mode(panel, mode);
+		err = clcdfb_of_get_dpi_panel_mode(panel, clcd_panel);
 	else
 		err = -ENOENT;
 	if (err)
 		return err;
+	mode = &clcd_panel->mode;
 
 	len = clcdfb_snprintf_mode(NULL, 0, mode);
 	name = devm_kzalloc(dev, len + 1, GFP_KERNEL);
@@ -616,6 +693,7 @@ static int clcdfb_of_init_tft_panel(struct clcd_fb *fb, u32 r0, u32 g0, u32 b0)
 	} panels[] = {
 		{ 0x110, 1,  7, 13, CLCD_CAP_5551 },
 		{ 0x110, 0,  8, 16, CLCD_CAP_888 },
+		{ 0x110, 16, 8, 0,  CLCD_CAP_888 },
 		{ 0x111, 4, 14, 20, CLCD_CAP_444 },
 		{ 0x111, 3, 11, 19, CLCD_CAP_444 | CLCD_CAP_5551 },
 		{ 0x111, 3, 10, 19, CLCD_CAP_444 | CLCD_CAP_5551 |
@@ -625,8 +703,8 @@ static int clcdfb_of_init_tft_panel(struct clcd_fb *fb, u32 r0, u32 g0, u32 b0)
 	};
 	int i;
 
-	/* Bypass pixel clock divider, data output on the falling edge */
-	fb->panel->tim2 = TIM2_BCD | TIM2_IPC;
+	/* Bypass pixel clock divider */
+	fb->panel->tim2 |= TIM2_BCD;
 
 	/* TFT display, vert. comp. interrupt at the start of the back porch */
 	fb->panel->cntl |= CNTL_LCDTFT | CNTL_LCDVCOMP(1);
@@ -643,6 +721,49 @@ static int clcdfb_of_init_tft_panel(struct clcd_fb *fb, u32 r0, u32 g0, u32 b0)
 			fb->panel->caps = panels[i].caps;
 	}
 
+	/*
+	 * If we actually physically connected the R lines to B and
+	 * vice versa
+	 */
+	if (r0 != 0 && b0 == 0)
+		fb->panel->bgr_connection = true;
+
+	if (fb->panel->caps && fb->vendor->st_bitmux_control) {
+		/*
+		 * Set up the special bits for the Nomadik control register
+		 * (other platforms tend to do this through an external
+		 * register).
+		 */
+
+		/* Offset of the highest used color */
+		int maxoff = max3(r0, g0, b0);
+		/* Most significant bit out, highest used bit */
+		int msb = 0;
+
+		if (fb->panel->caps & CLCD_CAP_888) {
+			msb = maxoff + 8 - 1;
+		} else if (fb->panel->caps & CLCD_CAP_565) {
+			msb = maxoff + 5 - 1;
+			fb->panel->cntl |= CNTL_ST_1XBPP_565;
+		} else if (fb->panel->caps & CLCD_CAP_5551) {
+			msb = maxoff + 5 - 1;
+			fb->panel->cntl |= CNTL_ST_1XBPP_5551;
+		} else if (fb->panel->caps & CLCD_CAP_444) {
+			msb = maxoff + 4 - 1;
+			fb->panel->cntl |= CNTL_ST_1XBPP_444;
+		}
+
+		/* Send out as many bits as we need */
+		if (msb > 17)
+			fb->panel->cntl |= CNTL_ST_CDWID_24;
+		else if (msb > 15)
+			fb->panel->cntl |= CNTL_ST_CDWID_18;
+		else if (msb > 11)
+			fb->panel->cntl |= CNTL_ST_CDWID_16;
+		else
+			fb->panel->cntl |= CNTL_ST_CDWID_12;
+	}
+
 	return fb->panel->caps ? 0 : -EINVAL;
 }
 
@@ -658,11 +779,24 @@ static int clcdfb_of_init_display(struct clcd_fb *fb)
 	if (!fb->panel)
 		return -ENOMEM;
 
+	/*
+	 * Fetch the panel endpoint.
+	 */
 	endpoint = of_graph_get_next_endpoint(fb->dev->dev.of_node, NULL);
 	if (!endpoint)
 		return -ENODEV;
 
-	err = clcdfb_of_get_mode(&fb->dev->dev, endpoint, &fb->panel->mode);
+	if (fb->vendor->init_panel) {
+		err = fb->vendor->init_panel(fb, endpoint);
+		if (err)
+			return err;
+	}
+
+	err = clcdfb_of_get_backlight(endpoint, fb->panel);
+	if (err)
+		return err;
+
+	err = clcdfb_of_get_mode(&fb->dev->dev, endpoint, fb->panel);
 	if (err)
 		return err;
 
@@ -693,11 +827,11 @@ static int clcdfb_of_init_display(struct clcd_fb *fb)
 
 	if (of_property_read_u32_array(endpoint,
 			"arm,pl11x,tft-r0g0b0-pads",
-			tft_r0b0g0, ARRAY_SIZE(tft_r0b0g0)) == 0)
-		return clcdfb_of_init_tft_panel(fb, tft_r0b0g0[0],
-				 tft_r0b0g0[1],  tft_r0b0g0[2]);
+			tft_r0b0g0, ARRAY_SIZE(tft_r0b0g0)) != 0)
+		return -ENOENT;
 
-	return -ENOENT;
+	return clcdfb_of_init_tft_panel(fb, tft_r0b0g0[0],
+					tft_r0b0g0[1],  tft_r0b0g0[2]);
 }
 
 static int clcdfb_of_vram_setup(struct clcd_fb *fb)
@@ -818,6 +952,7 @@ static struct clcd_board *clcdfb_of_get_board(struct amba_device *dev)
 static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id)
 {
 	struct clcd_board *board = dev_get_platdata(&dev->dev);
+	struct clcd_vendor_data *vendor = id->data;
 	struct clcd_fb *fb;
 	int ret;
 
@@ -827,6 +962,12 @@ static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id)
 	if (!board)
 		return -EINVAL;
 
+	if (vendor->init_board) {
+		ret = vendor->init_board(dev, board);
+		if (ret)
+			return ret;
+	}
+
 	ret = dma_set_mask_and_coherent(&dev->dev, DMA_BIT_MASK(32));
 	if (ret)
 		goto out;
@@ -845,17 +986,18 @@ static int clcdfb_probe(struct amba_device *dev, const struct amba_id *id)
 	}
 
 	fb->dev = dev;
+	fb->vendor = vendor;
 	fb->board = board;
 
-	dev_info(&fb->dev->dev, "PL%03x rev%u at 0x%08llx\n",
-		amba_part(dev), amba_rev(dev),
+	dev_info(&fb->dev->dev, "PL%03x designer %02x rev%u at 0x%08llx\n",
+		amba_part(dev), amba_manf(dev), amba_rev(dev),
 		(unsigned long long)dev->res.start);
 
 	ret = fb->board->setup(fb);
 	if (ret)
 		goto free_fb;
 
-	ret = clcdfb_register(fb); 
+	ret = clcdfb_register(fb);
 	if (ret == 0) {
 		amba_set_drvdata(dev, fb);
 		goto out;
@@ -891,10 +1033,30 @@ static int clcdfb_remove(struct amba_device *dev)
 	return 0;
 }
 
+static struct clcd_vendor_data vendor_arm = {
+	/* Sets up the versatile board displays */
+	.init_panel = versatile_clcd_init_panel,
+};
+
+static struct clcd_vendor_data vendor_nomadik = {
+	.clock_timregs = true,
+	.packed_24_bit_pixels = true,
+	.st_bitmux_control = true,
+	.init_board = nomadik_clcd_init_board,
+	.init_panel = nomadik_clcd_init_panel,
+};
+
 static struct amba_id clcdfb_id_table[] = {
 	{
 		.id	= 0x00041110,
 		.mask	= 0x000ffffe,
+		.data	= &vendor_arm,
+	},
+	/* ST Electronics Nomadik variant */
+	{
+		.id	= 0x00180110,
+		.mask	= 0x00fffffe,
+		.data	= &vendor_nomadik,
 	},
 	{ 0, 0 },
 };
diff --git a/drivers/video/fbdev/arcfb.c b/drivers/video/fbdev/arcfb.c
index 1b0b233b8b39..1928cb2b5386 100644
--- a/drivers/video/fbdev/arcfb.c
+++ b/drivers/video/fbdev/arcfb.c
@@ -79,7 +79,7 @@ struct arcfb_par {
 	spinlock_t lock;
 };
 
-static struct fb_fix_screeninfo arcfb_fix = {
+static const struct fb_fix_screeninfo arcfb_fix = {
 	.id =		"arcfb",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_MONO01,
@@ -89,7 +89,7 @@ static struct fb_fix_screeninfo arcfb_fix = {
 	.accel =	FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo arcfb_var = {
+static const struct fb_var_screeninfo arcfb_var = {
 	.xres		= 128,
 	.yres		= 64,
 	.xres_virtual	= 128,
diff --git a/drivers/video/fbdev/asiliantfb.c b/drivers/video/fbdev/asiliantfb.c
index 7e8ddf00ccc2..91eea4583382 100644
--- a/drivers/video/fbdev/asiliantfb.c
+++ b/drivers/video/fbdev/asiliantfb.c
@@ -474,7 +474,7 @@ static void chips_hw_init(struct fb_info *p)
 		write_fr(chips_init_fr[i].addr, chips_init_fr[i].data);
 }
 
-static struct fb_fix_screeninfo asiliantfb_fix = {
+static const struct fb_fix_screeninfo asiliantfb_fix = {
 	.id =		"Asiliant 69000",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_PSEUDOCOLOR,
@@ -483,7 +483,7 @@ static struct fb_fix_screeninfo asiliantfb_fix = {
 	.smem_len =	0x200000,	/* 2MB */
 };
 
-static struct fb_var_screeninfo asiliantfb_var = {
+static const struct fb_var_screeninfo asiliantfb_var = {
 	.xres 		= 640,
 	.yres 		= 480,
 	.xres_virtual 	= 640,
diff --git a/drivers/video/fbdev/aty/aty128fb.c b/drivers/video/fbdev/aty/aty128fb.c
index 0a4626886b00..fa07242a78d2 100644
--- a/drivers/video/fbdev/aty/aty128fb.c
+++ b/drivers/video/fbdev/aty/aty128fb.c
@@ -93,7 +93,7 @@
 
 #ifndef CONFIG_PPC_PMAC
 /* default mode */
-static struct fb_var_screeninfo default_var = {
+static const struct fb_var_screeninfo default_var = {
 	/* 640x480, 60 Hz, Non-Interlaced (25.175 MHz dotclock) */
 	640, 480, 640, 480, 0, 0, 8, 0,
 	{0, 8, 0}, {0, 8, 0}, {0, 8, 0}, {0, 0, 0},
@@ -104,7 +104,7 @@ static struct fb_var_screeninfo default_var = {
 #else /* CONFIG_PPC_PMAC */
 /* default to 1024x768 at 75Hz on PPC - this will work
  * on the iMac, the usual 640x480 @ 60Hz doesn't. */
-static struct fb_var_screeninfo default_var = {
+static const struct fb_var_screeninfo default_var = {
 	/* 1024x768, 75 Hz, Non-Interlaced (78.75 MHz dotclock) */
 	1024, 768, 1024, 768, 0, 0, 8, 0,
 	{0, 8, 0}, {0, 8, 0}, {0, 8, 0}, {0, 0, 0},
@@ -375,7 +375,7 @@ static const struct aty128_meminfo ddr_sgram = {
 	.name = "64-bit DDR SGRAM",
 };
 
-static struct fb_fix_screeninfo aty128fb_fix = {
+static const struct fb_fix_screeninfo aty128fb_fix = {
 	.id		= "ATY Rage128",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_PSEUDOCOLOR,
diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c
index f34ed47fcaf8..11026e726b68 100644
--- a/drivers/video/fbdev/aty/atyfb_base.c
+++ b/drivers/video/fbdev/aty/atyfb_base.c
@@ -212,7 +212,7 @@ struct pci_mmap_map {
 	unsigned long prot_mask;
 };
 
-static struct fb_fix_screeninfo atyfb_fix = {
+static const struct fb_fix_screeninfo atyfb_fix = {
 	.id		= "ATY Mach64",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_PSEUDOCOLOR,
diff --git a/drivers/video/fbdev/aty/radeon_monitor.c b/drivers/video/fbdev/aty/radeon_monitor.c
index f1ce229de78d..278b421ab3fe 100644
--- a/drivers/video/fbdev/aty/radeon_monitor.c
+++ b/drivers/video/fbdev/aty/radeon_monitor.c
@@ -4,7 +4,7 @@
 
 #include "../edid.h"
 
-static struct fb_var_screeninfo radeonfb_default_var = {
+static const struct fb_var_screeninfo radeonfb_default_var = {
 	.xres		= 640,
 	.yres		= 480,
 	.xres_virtual	= 640,
diff --git a/drivers/video/fbdev/au1200fb.c b/drivers/video/fbdev/au1200fb.c
index f9507b1894df..6c2b2ca4a909 100644
--- a/drivers/video/fbdev/au1200fb.c
+++ b/drivers/video/fbdev/au1200fb.c
@@ -43,6 +43,7 @@
 #include <linux/ctype.h>
 #include <linux/dma-mapping.h>
 #include <linux/slab.h>
+#include <linux/uaccess.h>
 
 #include <asm/mach-au1x00/au1000.h>
 #include <asm/mach-au1x00/au1200fb.h>	/* platform_data */
diff --git a/drivers/video/fbdev/bfin_adv7393fb.c b/drivers/video/fbdev/bfin_adv7393fb.c
index e2d7d039ce3b..542ffaddc6ab 100644
--- a/drivers/video/fbdev/bfin_adv7393fb.c
+++ b/drivers/video/fbdev/bfin_adv7393fb.c
@@ -375,7 +375,6 @@ static int bfin_adv7393_fb_probe(struct i2c_client *client,
 {
 	int ret = 0;
 	struct proc_dir_entry *entry;
-	int num_modes = ARRAY_SIZE(known_modes);
 
 	struct adv7393fb_device *fbdev = NULL;
 
@@ -384,7 +383,7 @@ static int bfin_adv7393_fb_probe(struct i2c_client *client,
 		return -EINVAL;
 	}
 
-	if (mode > num_modes) {
+	if (mode >= ARRAY_SIZE(known_modes)) {
 		dev_err(&client->dev, "mode %d: not supported", mode);
 		return -EFAULT;
 	}
@@ -797,7 +796,7 @@ static struct i2c_driver bfin_adv7393_fb_driver = {
 
 static int __init bfin_adv7393_fb_driver_init(void)
 {
-#if  defined(CONFIG_I2C_BLACKFIN_TWI) || defined(CONFIG_I2C_BLACKFIN_TWI_MODULE)
+#if IS_ENABLED(CONFIG_I2C_BLACKFIN_TWI)
 	request_module("i2c-bfin-twi");
 #else
 	request_module("i2c-gpio");
diff --git a/drivers/video/fbdev/efifb.c b/drivers/video/fbdev/efifb.c
index 924bad45c176..37a37c4d04cb 100644
--- a/drivers/video/fbdev/efifb.c
+++ b/drivers/video/fbdev/efifb.c
@@ -50,9 +50,9 @@ static int efifb_setcolreg(unsigned regno, unsigned red, unsigned green,
 		return 1;
 
 	if (regno < 16) {
-		red   >>= 8;
-		green >>= 8;
-		blue  >>= 8;
+		red   >>= 16 - info->var.red.length;
+		green >>= 16 - info->var.green.length;
+		blue  >>= 16 - info->var.blue.length;
 		((u32 *)(info->pseudo_palette))[regno] =
 			(red   << info->var.red.offset)   |
 			(green << info->var.green.offset) |
diff --git a/drivers/video/fbdev/exynos/Kconfig b/drivers/video/fbdev/exynos/Kconfig
deleted file mode 100644
index d916bef94f25..000000000000
--- a/drivers/video/fbdev/exynos/Kconfig
+++ /dev/null
@@ -1,32 +0,0 @@
-#
-# Exynos Video configuration
-#
-
-menuconfig EXYNOS_VIDEO
-	tristate "Exynos Video driver support"
-	depends on ARCH_S5PV210 || ARCH_EXYNOS
-	help
-	  This enables support for EXYNOS Video device.
-
-if EXYNOS_VIDEO
-
-#
-# MIPI DSI driver
-#
-
-config EXYNOS_MIPI_DSI
-	tristate "EXYNOS MIPI DSI driver support."
-	select GENERIC_PHY
-	help
-	  This enables support for MIPI-DSI device.
-
-config EXYNOS_LCD_S6E8AX0
-	tristate "S6E8AX0 MIPI AMOLED LCD Driver"
-	depends on EXYNOS_MIPI_DSI && BACKLIGHT_CLASS_DEVICE
-	depends on (LCD_CLASS_DEVICE = y)
-	default n
-	help
-	  If you have an S6E8AX0 MIPI AMOLED LCD Panel, say Y to enable its
-	  LCD control driver.
-
-endif # EXYNOS_VIDEO
diff --git a/drivers/video/fbdev/exynos/Makefile b/drivers/video/fbdev/exynos/Makefile
deleted file mode 100644
index 02d8dc522fea..000000000000
--- a/drivers/video/fbdev/exynos/Makefile
+++ /dev/null
@@ -1,9 +0,0 @@
-#
-# Makefile for the exynos video drivers.
-#
-
-obj-$(CONFIG_EXYNOS_MIPI_DSI)		+= exynos-mipi-dsi-mod.o
-
-exynos-mipi-dsi-mod-objs		+= exynos_mipi_dsi.o exynos_mipi_dsi_common.o \
-					   exynos_mipi_dsi_lowlevel.o
-obj-$(CONFIG_EXYNOS_LCD_S6E8AX0)	+= s6e8ax0.o
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi.c b/drivers/video/fbdev/exynos/exynos_mipi_dsi.c
deleted file mode 100644
index 92e4af3caaf8..000000000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi.c
+++ /dev/null
@@ -1,574 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi.c
- *
- * Samsung SoC MIPI-DSIM driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/clk.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/fb.h>
-#include <linux/ctype.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/irq.h>
-#include <linux/memory.h>
-#include <linux/delay.h>
-#include <linux/interrupt.h>
-#include <linux/kthread.h>
-#include <linux/notifier.h>
-#include <linux/phy/phy.h>
-#include <linux/regulator/consumer.h>
-#include <linux/pm_runtime.h>
-#include <linux/err.h>
-
-#include <video/exynos_mipi_dsim.h>
-
-#include "exynos_mipi_dsi_common.h"
-#include "exynos_mipi_dsi_lowlevel.h"
-
-struct mipi_dsim_ddi {
-	int				bus_id;
-	struct list_head		list;
-	struct mipi_dsim_lcd_device	*dsim_lcd_dev;
-	struct mipi_dsim_lcd_driver	*dsim_lcd_drv;
-};
-
-static LIST_HEAD(dsim_ddi_list);
-
-static DEFINE_MUTEX(mipi_dsim_lock);
-
-static struct mipi_dsim_platform_data *to_dsim_plat(struct platform_device
-							*pdev)
-{
-	return pdev->dev.platform_data;
-}
-
-static struct regulator_bulk_data supplies[] = {
-	{ .supply = "vdd11", },
-	{ .supply = "vdd18", },
-};
-
-static int exynos_mipi_regulator_enable(struct mipi_dsim_device *dsim)
-{
-	int ret;
-
-	mutex_lock(&dsim->lock);
-	ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies);
-	mutex_unlock(&dsim->lock);
-
-	return ret;
-}
-
-static int exynos_mipi_regulator_disable(struct mipi_dsim_device *dsim)
-{
-	int ret;
-
-	mutex_lock(&dsim->lock);
-	ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies);
-	mutex_unlock(&dsim->lock);
-
-	return ret;
-}
-
-/* update all register settings to MIPI DSI controller. */
-static void exynos_mipi_update_cfg(struct mipi_dsim_device *dsim)
-{
-	/*
-	 * data from Display controller(FIMD) is not transferred in video mode
-	 * but in case of command mode, all settings is not updated to
-	 * registers.
-	 */
-	exynos_mipi_dsi_stand_by(dsim, 0);
-
-	exynos_mipi_dsi_init_dsim(dsim);
-	exynos_mipi_dsi_init_link(dsim);
-
-	exynos_mipi_dsi_set_hs_enable(dsim);
-
-	/* set display timing. */
-	exynos_mipi_dsi_set_display_mode(dsim, dsim->dsim_config);
-
-	exynos_mipi_dsi_init_interrupt(dsim);
-
-	/*
-	 * data from Display controller(FIMD) is transferred in video mode
-	 * but in case of command mode, all settings are updated to registers.
-	 */
-	exynos_mipi_dsi_stand_by(dsim, 1);
-}
-
-static int exynos_mipi_dsi_early_blank_mode(struct mipi_dsim_device *dsim,
-		int power)
-{
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	switch (power) {
-	case FB_BLANK_POWERDOWN:
-		if (dsim->suspended)
-			return 0;
-
-		if (client_drv && client_drv->suspend)
-			client_drv->suspend(client_dev);
-
-		clk_disable(dsim->clock);
-
-		exynos_mipi_regulator_disable(dsim);
-
-		dsim->suspended = true;
-
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-static int exynos_mipi_dsi_blank_mode(struct mipi_dsim_device *dsim, int power)
-{
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	switch (power) {
-	case FB_BLANK_UNBLANK:
-		if (!dsim->suspended)
-			return 0;
-
-		/* lcd panel power on. */
-		if (client_drv && client_drv->power_on)
-			client_drv->power_on(client_dev, 1);
-
-		exynos_mipi_regulator_enable(dsim);
-
-		/* enable MIPI-DSI PHY. */
-		phy_power_on(dsim->phy);
-
-		clk_enable(dsim->clock);
-
-		exynos_mipi_update_cfg(dsim);
-
-		/* set lcd panel sequence commands. */
-		if (client_drv && client_drv->set_sequence)
-			client_drv->set_sequence(client_dev);
-
-		dsim->suspended = false;
-
-		break;
-	case FB_BLANK_NORMAL:
-		/* TODO. */
-		break;
-	default:
-		break;
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_register_lcd_device(struct mipi_dsim_lcd_device *lcd_dev)
-{
-	struct mipi_dsim_ddi *dsim_ddi;
-
-	if (!lcd_dev->name) {
-		pr_err("dsim_lcd_device name is NULL.\n");
-		return -EFAULT;
-	}
-
-	dsim_ddi = kzalloc(sizeof(struct mipi_dsim_ddi), GFP_KERNEL);
-	if (!dsim_ddi) {
-		pr_err("failed to allocate dsim_ddi object.\n");
-		return -ENOMEM;
-	}
-
-	dsim_ddi->dsim_lcd_dev = lcd_dev;
-
-	mutex_lock(&mipi_dsim_lock);
-	list_add_tail(&dsim_ddi->list, &dsim_ddi_list);
-	mutex_unlock(&mipi_dsim_lock);
-
-	return 0;
-}
-
-static struct mipi_dsim_ddi *exynos_mipi_dsi_find_lcd_device(
-					struct mipi_dsim_lcd_driver *lcd_drv)
-{
-	struct mipi_dsim_ddi *dsim_ddi, *next;
-	struct mipi_dsim_lcd_device *lcd_dev;
-
-	mutex_lock(&mipi_dsim_lock);
-
-	list_for_each_entry_safe(dsim_ddi, next, &dsim_ddi_list, list) {
-		if (!dsim_ddi)
-			goto out;
-
-		lcd_dev = dsim_ddi->dsim_lcd_dev;
-		if (!lcd_dev)
-			continue;
-
-		if ((strcmp(lcd_drv->name, lcd_dev->name)) == 0) {
-			/**
-			 * bus_id would be used to identify
-			 * connected bus.
-			 */
-			dsim_ddi->bus_id = lcd_dev->bus_id;
-			mutex_unlock(&mipi_dsim_lock);
-
-			return dsim_ddi;
-		}
-
-		list_del(&dsim_ddi->list);
-		kfree(dsim_ddi);
-	}
-
-out:
-	mutex_unlock(&mipi_dsim_lock);
-
-	return NULL;
-}
-
-int exynos_mipi_dsi_register_lcd_driver(struct mipi_dsim_lcd_driver *lcd_drv)
-{
-	struct mipi_dsim_ddi *dsim_ddi;
-
-	if (!lcd_drv->name) {
-		pr_err("dsim_lcd_driver name is NULL.\n");
-		return -EFAULT;
-	}
-
-	dsim_ddi = exynos_mipi_dsi_find_lcd_device(lcd_drv);
-	if (!dsim_ddi) {
-		pr_err("mipi_dsim_ddi object not found.\n");
-		return -EFAULT;
-	}
-
-	dsim_ddi->dsim_lcd_drv = lcd_drv;
-
-	pr_info("registered panel driver(%s) to mipi-dsi driver.\n",
-		lcd_drv->name);
-
-	return 0;
-
-}
-EXPORT_SYMBOL_GPL(exynos_mipi_dsi_register_lcd_driver);
-
-static struct mipi_dsim_ddi *exynos_mipi_dsi_bind_lcd_ddi(
-						struct mipi_dsim_device *dsim,
-						const char *name)
-{
-	struct mipi_dsim_ddi *dsim_ddi, *next;
-	struct mipi_dsim_lcd_driver *lcd_drv;
-	struct mipi_dsim_lcd_device *lcd_dev;
-	int ret;
-
-	mutex_lock(&dsim->lock);
-
-	list_for_each_entry_safe(dsim_ddi, next, &dsim_ddi_list, list) {
-		lcd_drv = dsim_ddi->dsim_lcd_drv;
-		lcd_dev = dsim_ddi->dsim_lcd_dev;
-		if (!lcd_drv || !lcd_dev ||
-			(dsim->id != dsim_ddi->bus_id))
-				continue;
-
-		dev_dbg(dsim->dev, "lcd_drv->id = %d, lcd_dev->id = %d\n",
-				lcd_drv->id, lcd_dev->id);
-		dev_dbg(dsim->dev, "lcd_dev->bus_id = %d, dsim->id = %d\n",
-				lcd_dev->bus_id, dsim->id);
-
-		if ((strcmp(lcd_drv->name, name) == 0)) {
-			lcd_dev->master = dsim;
-
-			lcd_dev->dev.parent = dsim->dev;
-			dev_set_name(&lcd_dev->dev, "%s", lcd_drv->name);
-
-			ret = device_register(&lcd_dev->dev);
-			if (ret < 0) {
-				dev_err(dsim->dev,
-					"can't register %s, status %d\n",
-					dev_name(&lcd_dev->dev), ret);
-				mutex_unlock(&dsim->lock);
-
-				return NULL;
-			}
-
-			dsim->dsim_lcd_dev = lcd_dev;
-			dsim->dsim_lcd_drv = lcd_drv;
-
-			mutex_unlock(&dsim->lock);
-
-			return dsim_ddi;
-		}
-	}
-
-	mutex_unlock(&dsim->lock);
-
-	return NULL;
-}
-
-/* define MIPI-DSI Master operations. */
-static struct mipi_dsim_master_ops master_ops = {
-	.cmd_read			= exynos_mipi_dsi_rd_data,
-	.cmd_write			= exynos_mipi_dsi_wr_data,
-	.get_dsim_frame_done		= exynos_mipi_dsi_get_frame_done_status,
-	.clear_dsim_frame_done		= exynos_mipi_dsi_clear_frame_done,
-	.set_early_blank_mode		= exynos_mipi_dsi_early_blank_mode,
-	.set_blank_mode			= exynos_mipi_dsi_blank_mode,
-};
-
-static int exynos_mipi_dsi_probe(struct platform_device *pdev)
-{
-	struct resource *res;
-	struct mipi_dsim_device *dsim;
-	struct mipi_dsim_config *dsim_config;
-	struct mipi_dsim_platform_data *dsim_pd;
-	struct mipi_dsim_ddi *dsim_ddi;
-	int ret = -EINVAL;
-
-	dsim = devm_kzalloc(&pdev->dev, sizeof(struct mipi_dsim_device),
-				GFP_KERNEL);
-	if (!dsim) {
-		dev_err(&pdev->dev, "failed to allocate dsim object.\n");
-		return -ENOMEM;
-	}
-
-	dsim->pd = to_dsim_plat(pdev);
-	dsim->dev = &pdev->dev;
-	dsim->id = pdev->id;
-
-	/* get mipi_dsim_platform_data. */
-	dsim_pd = (struct mipi_dsim_platform_data *)dsim->pd;
-	if (dsim_pd == NULL) {
-		dev_err(&pdev->dev, "failed to get platform data for dsim.\n");
-		return -EINVAL;
-	}
-	/* get mipi_dsim_config. */
-	dsim_config = dsim_pd->dsim_config;
-	if (dsim_config == NULL) {
-		dev_err(&pdev->dev, "failed to get dsim config data.\n");
-		return -EINVAL;
-	}
-
-	dsim->dsim_config = dsim_config;
-	dsim->master_ops = &master_ops;
-
-	mutex_init(&dsim->lock);
-
-	ret = devm_regulator_bulk_get(&pdev->dev, ARRAY_SIZE(supplies),
-					supplies);
-	if (ret) {
-		dev_err(&pdev->dev, "Failed to get regulators: %d\n", ret);
-		return ret;
-	}
-
-	dsim->phy = devm_phy_get(&pdev->dev, "dsim");
-	if (IS_ERR(dsim->phy))
-		return PTR_ERR(dsim->phy);
-
-	dsim->clock = devm_clk_get(&pdev->dev, "dsim0");
-	if (IS_ERR(dsim->clock)) {
-		dev_err(&pdev->dev, "failed to get dsim clock source\n");
-		return -ENODEV;
-	}
-
-	clk_enable(dsim->clock);
-
-	res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
-
-	dsim->reg_base = devm_ioremap_resource(&pdev->dev, res);
-	if (IS_ERR(dsim->reg_base)) {
-		ret = PTR_ERR(dsim->reg_base);
-		goto error;
-	}
-
-	mutex_init(&dsim->lock);
-
-	/* bind lcd ddi matched with panel name. */
-	dsim_ddi = exynos_mipi_dsi_bind_lcd_ddi(dsim, dsim_pd->lcd_panel_name);
-	if (!dsim_ddi) {
-		dev_err(&pdev->dev, "mipi_dsim_ddi object not found.\n");
-		ret = -EINVAL;
-		goto error;
-	}
-
-	ret = platform_get_irq(pdev, 0);
-	if (ret < 0) {
-		dev_err(&pdev->dev, "failed to request dsim irq resource\n");
-		goto error;
-	}
-	dsim->irq = ret;
-
-	init_completion(&dsim_wr_comp);
-	init_completion(&dsim_rd_comp);
-	platform_set_drvdata(pdev, dsim);
-
-	ret = devm_request_irq(&pdev->dev, dsim->irq,
-			exynos_mipi_dsi_interrupt_handler,
-			IRQF_SHARED, dev_name(&pdev->dev), dsim);
-	if (ret != 0) {
-		dev_err(&pdev->dev, "failed to request dsim irq\n");
-		ret = -EINVAL;
-		goto error;
-	}
-
-	/* enable interrupts */
-	exynos_mipi_dsi_init_interrupt(dsim);
-
-	/* initialize mipi-dsi client(lcd panel). */
-	if (dsim_ddi->dsim_lcd_drv && dsim_ddi->dsim_lcd_drv->probe)
-		dsim_ddi->dsim_lcd_drv->probe(dsim_ddi->dsim_lcd_dev);
-
-	/* in case mipi-dsi has been enabled by bootloader */
-	if (dsim_pd->enabled) {
-		exynos_mipi_regulator_enable(dsim);
-		goto done;
-	}
-
-	/* lcd panel power on. */
-	if (dsim_ddi->dsim_lcd_drv && dsim_ddi->dsim_lcd_drv->power_on)
-		dsim_ddi->dsim_lcd_drv->power_on(dsim_ddi->dsim_lcd_dev, 1);
-
-	exynos_mipi_regulator_enable(dsim);
-
-	/* enable MIPI-DSI PHY. */
-	phy_power_on(dsim->phy);
-
-	exynos_mipi_update_cfg(dsim);
-
-	/* set lcd panel sequence commands. */
-	if (dsim_ddi->dsim_lcd_drv && dsim_ddi->dsim_lcd_drv->set_sequence)
-		dsim_ddi->dsim_lcd_drv->set_sequence(dsim_ddi->dsim_lcd_dev);
-
-	dsim->suspended = false;
-
-done:
-	platform_set_drvdata(pdev, dsim);
-
-	dev_dbg(&pdev->dev, "%s() completed successfully (%s mode)\n", __func__,
-		dsim_config->e_interface == DSIM_COMMAND ? "CPU" : "RGB");
-
-	return 0;
-
-error:
-	clk_disable(dsim->clock);
-	return ret;
-}
-
-static int exynos_mipi_dsi_remove(struct platform_device *pdev)
-{
-	struct mipi_dsim_device *dsim = platform_get_drvdata(pdev);
-	struct mipi_dsim_ddi *dsim_ddi, *next;
-	struct mipi_dsim_lcd_driver *dsim_lcd_drv;
-
-	clk_disable(dsim->clock);
-
-	list_for_each_entry_safe(dsim_ddi, next, &dsim_ddi_list, list) {
-		if (dsim_ddi) {
-			if (dsim->id != dsim_ddi->bus_id)
-				continue;
-
-			dsim_lcd_drv = dsim_ddi->dsim_lcd_drv;
-
-			if (dsim_lcd_drv->remove)
-				dsim_lcd_drv->remove(dsim_ddi->dsim_lcd_dev);
-
-			kfree(dsim_ddi);
-		}
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_PM_SLEEP
-static int exynos_mipi_dsi_suspend(struct device *dev)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct mipi_dsim_device *dsim = platform_get_drvdata(pdev);
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	disable_irq(dsim->irq);
-
-	if (dsim->suspended)
-		return 0;
-
-	if (client_drv && client_drv->suspend)
-		client_drv->suspend(client_dev);
-
-	/* disable MIPI-DSI PHY. */
-	phy_power_off(dsim->phy);
-
-	clk_disable(dsim->clock);
-
-	exynos_mipi_regulator_disable(dsim);
-
-	dsim->suspended = true;
-
-	return 0;
-}
-
-static int exynos_mipi_dsi_resume(struct device *dev)
-{
-	struct platform_device *pdev = to_platform_device(dev);
-	struct mipi_dsim_device *dsim = platform_get_drvdata(pdev);
-	struct mipi_dsim_lcd_driver *client_drv = dsim->dsim_lcd_drv;
-	struct mipi_dsim_lcd_device *client_dev = dsim->dsim_lcd_dev;
-
-	enable_irq(dsim->irq);
-
-	if (!dsim->suspended)
-		return 0;
-
-	/* lcd panel power on. */
-	if (client_drv && client_drv->power_on)
-		client_drv->power_on(client_dev, 1);
-
-	exynos_mipi_regulator_enable(dsim);
-
-	/* enable MIPI-DSI PHY. */
-	phy_power_on(dsim->phy);
-
-	clk_enable(dsim->clock);
-
-	exynos_mipi_update_cfg(dsim);
-
-	/* set lcd panel sequence commands. */
-	if (client_drv && client_drv->set_sequence)
-		client_drv->set_sequence(client_dev);
-
-	dsim->suspended = false;
-
-	return 0;
-}
-#endif
-
-static const struct dev_pm_ops exynos_mipi_dsi_pm_ops = {
-	SET_SYSTEM_SLEEP_PM_OPS(exynos_mipi_dsi_suspend, exynos_mipi_dsi_resume)
-};
-
-static struct platform_driver exynos_mipi_dsi_driver = {
-	.probe = exynos_mipi_dsi_probe,
-	.remove = exynos_mipi_dsi_remove,
-	.driver = {
-		   .name = "exynos-mipi-dsim",
-		   .pm = &exynos_mipi_dsi_pm_ops,
-	},
-};
-
-module_platform_driver(exynos_mipi_dsi_driver);
-
-MODULE_AUTHOR("InKi Dae <inki.dae@samsung.com>");
-MODULE_DESCRIPTION("Samsung SoC MIPI-DSI driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c b/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c
deleted file mode 100644
index 2358a2fbbbcd..000000000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.c
+++ /dev/null
@@ -1,880 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi_common.c
- *
- * Samsung SoC MIPI-DSI common driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/fb.h>
-#include <linux/ctype.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-#include <linux/memory.h>
-#include <linux/delay.h>
-#include <linux/irqreturn.h>
-#include <linux/kthread.h>
-
-#include <video/mipi_display.h>
-#include <video/exynos_mipi_dsim.h>
-
-#include "exynos_mipi_dsi_regs.h"
-#include "exynos_mipi_dsi_lowlevel.h"
-#include "exynos_mipi_dsi_common.h"
-
-#define MIPI_FIFO_TIMEOUT	msecs_to_jiffies(250)
-#define MIPI_RX_FIFO_READ_DONE  0x30800002
-#define MIPI_MAX_RX_FIFO        20
-#define MHZ			(1000 * 1000)
-#define FIN_HZ			(24 * MHZ)
-
-#define DFIN_PLL_MIN_HZ		(6 * MHZ)
-#define DFIN_PLL_MAX_HZ		(12 * MHZ)
-
-#define DFVCO_MIN_HZ		(500 * MHZ)
-#define DFVCO_MAX_HZ		(1000 * MHZ)
-
-#define TRY_GET_FIFO_TIMEOUT	(5000 * 2)
-#define TRY_FIFO_CLEAR		(10)
-
-/* MIPI-DSIM status types. */
-enum {
-	DSIM_STATE_INIT,	/* should be initialized. */
-	DSIM_STATE_STOP,	/* CPU and LCDC are LP mode. */
-	DSIM_STATE_HSCLKEN,	/* HS clock was enabled. */
-	DSIM_STATE_ULPS
-};
-
-/* define DSI lane types. */
-enum {
-	DSIM_LANE_CLOCK = (1 << 0),
-	DSIM_LANE_DATA0 = (1 << 1),
-	DSIM_LANE_DATA1 = (1 << 2),
-	DSIM_LANE_DATA2 = (1 << 3),
-	DSIM_LANE_DATA3 = (1 << 4)
-};
-
-static unsigned int dpll_table[15] = {
-	100, 120, 170, 220, 270,
-	320, 390, 450, 510, 560,
-	640, 690, 770, 870, 950
-};
-
-irqreturn_t exynos_mipi_dsi_interrupt_handler(int irq, void *dev_id)
-{
-	struct mipi_dsim_device *dsim = dev_id;
-	unsigned int intsrc, intmsk;
-
-	intsrc = exynos_mipi_dsi_read_interrupt(dsim);
-	intmsk = exynos_mipi_dsi_read_interrupt_mask(dsim);
-	intmsk = ~intmsk & intsrc;
-
-	if (intsrc & INTMSK_RX_DONE) {
-		complete(&dsim_rd_comp);
-		dev_dbg(dsim->dev, "MIPI INTMSK_RX_DONE\n");
-	}
-	if (intsrc & INTMSK_FIFO_EMPTY) {
-		complete(&dsim_wr_comp);
-		dev_dbg(dsim->dev, "MIPI INTMSK_FIFO_EMPTY\n");
-	}
-
-	exynos_mipi_dsi_clear_interrupt(dsim, intmsk);
-
-	return IRQ_HANDLED;
-}
-
-/*
- * write long packet to mipi dsi slave
- * @dsim: mipi dsim device structure.
- * @data0: packet data to send.
- * @data1: size of packet data
- */
-static void exynos_mipi_dsi_long_data_wr(struct mipi_dsim_device *dsim,
-		const unsigned char *data0, unsigned int data_size)
-{
-	unsigned int data_cnt = 0, payload = 0;
-
-	/* in case that data count is more then 4 */
-	for (data_cnt = 0; data_cnt < data_size; data_cnt += 4) {
-		/*
-		 * after sending 4bytes per one time,
-		 * send remainder data less then 4.
-		 */
-		if ((data_size - data_cnt) < 4) {
-			if ((data_size - data_cnt) == 3) {
-				payload = data0[data_cnt] |
-				    data0[data_cnt + 1] << 8 |
-					data0[data_cnt + 2] << 16;
-			dev_dbg(dsim->dev, "count = 3 payload = %x, %x %x %x\n",
-				payload, data0[data_cnt],
-				data0[data_cnt + 1],
-				data0[data_cnt + 2]);
-			} else if ((data_size - data_cnt) == 2) {
-				payload = data0[data_cnt] |
-					data0[data_cnt + 1] << 8;
-			dev_dbg(dsim->dev,
-				"count = 2 payload = %x, %x %x\n", payload,
-				data0[data_cnt],
-				data0[data_cnt + 1]);
-			} else if ((data_size - data_cnt) == 1) {
-				payload = data0[data_cnt];
-			}
-
-			exynos_mipi_dsi_wr_tx_data(dsim, payload);
-		/* send 4bytes per one time. */
-		} else {
-			payload = data0[data_cnt] |
-				data0[data_cnt + 1] << 8 |
-				data0[data_cnt + 2] << 16 |
-				data0[data_cnt + 3] << 24;
-
-			dev_dbg(dsim->dev,
-				"count = 4 payload = %x, %x %x %x %x\n",
-				payload, *(u8 *)(data0 + data_cnt),
-				data0[data_cnt + 1],
-				data0[data_cnt + 2],
-				data0[data_cnt + 3]);
-
-			exynos_mipi_dsi_wr_tx_data(dsim, payload);
-		}
-	}
-}
-
-int exynos_mipi_dsi_wr_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	const unsigned char *data0, unsigned int data_size)
-{
-	unsigned int check_rx_ack = 0;
-
-	if (dsim->state == DSIM_STATE_ULPS) {
-		dev_err(dsim->dev, "state is ULPS.\n");
-
-		return -EINVAL;
-	}
-
-	/* FIXME!!! why does it need this delay? */
-	msleep(20);
-
-	mutex_lock(&dsim->lock);
-
-	switch (data_id) {
-	/* short packet types of packet types for command. */
-	case MIPI_DSI_GENERIC_SHORT_WRITE_0_PARAM:
-	case MIPI_DSI_GENERIC_SHORT_WRITE_1_PARAM:
-	case MIPI_DSI_GENERIC_SHORT_WRITE_2_PARAM:
-	case MIPI_DSI_DCS_SHORT_WRITE:
-	case MIPI_DSI_DCS_SHORT_WRITE_PARAM:
-	case MIPI_DSI_SET_MAXIMUM_RETURN_PACKET_SIZE:
-		exynos_mipi_dsi_wr_tx_header(dsim, data_id, data0[0], data0[1]);
-		if (check_rx_ack) {
-			/* process response func should be implemented */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-
-	/* general command */
-	case MIPI_DSI_COLOR_MODE_OFF:
-	case MIPI_DSI_COLOR_MODE_ON:
-	case MIPI_DSI_SHUTDOWN_PERIPHERAL:
-	case MIPI_DSI_TURN_ON_PERIPHERAL:
-		exynos_mipi_dsi_wr_tx_header(dsim, data_id, data0[0], data0[1]);
-		if (check_rx_ack) {
-			/* process response func should be implemented. */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-
-	/* packet types for video data */
-	case MIPI_DSI_V_SYNC_START:
-	case MIPI_DSI_V_SYNC_END:
-	case MIPI_DSI_H_SYNC_START:
-	case MIPI_DSI_H_SYNC_END:
-	case MIPI_DSI_END_OF_TRANSMISSION:
-		mutex_unlock(&dsim->lock);
-		return 0;
-
-	/* long packet type and null packet */
-	case MIPI_DSI_NULL_PACKET:
-	case MIPI_DSI_BLANKING_PACKET:
-		mutex_unlock(&dsim->lock);
-		return 0;
-	case MIPI_DSI_GENERIC_LONG_WRITE:
-	case MIPI_DSI_DCS_LONG_WRITE:
-	{
-		unsigned int size, payload = 0;
-		reinit_completion(&dsim_wr_comp);
-
-		size = data_size * 4;
-
-		/* if data count is less then 4, then send 3bytes data.  */
-		if (data_size < 4) {
-			payload = data0[0] |
-				data0[1] << 8 |
-				data0[2] << 16;
-
-			exynos_mipi_dsi_wr_tx_data(dsim, payload);
-
-			dev_dbg(dsim->dev, "count = %d payload = %x,%x %x %x\n",
-				data_size, payload, data0[0],
-				data0[1], data0[2]);
-
-		/* in case that data count is more then 4 */
-		} else
-			exynos_mipi_dsi_long_data_wr(dsim, data0, data_size);
-
-		/* put data into header fifo */
-		exynos_mipi_dsi_wr_tx_header(dsim, data_id, data_size & 0xff,
-			(data_size & 0xff00) >> 8);
-
-		if (!wait_for_completion_interruptible_timeout(&dsim_wr_comp,
-							MIPI_FIFO_TIMEOUT)) {
-			dev_warn(dsim->dev, "command write timeout.\n");
-			mutex_unlock(&dsim->lock);
-			return -EAGAIN;
-		}
-
-		if (check_rx_ack) {
-			/* process response func should be implemented. */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-	}
-
-	/* packet typo for video data */
-	case MIPI_DSI_PACKED_PIXEL_STREAM_16:
-	case MIPI_DSI_PACKED_PIXEL_STREAM_18:
-	case MIPI_DSI_PIXEL_STREAM_3BYTE_18:
-	case MIPI_DSI_PACKED_PIXEL_STREAM_24:
-		if (check_rx_ack) {
-			/* process response func should be implemented. */
-			mutex_unlock(&dsim->lock);
-			return 0;
-		} else {
-			mutex_unlock(&dsim->lock);
-			return -EINVAL;
-		}
-	default:
-		dev_warn(dsim->dev,
-			"data id %x is not supported current DSI spec.\n",
-			data_id);
-
-		mutex_unlock(&dsim->lock);
-		return -EINVAL;
-	}
-}
-
-static unsigned int exynos_mipi_dsi_long_data_rd(struct mipi_dsim_device *dsim,
-		unsigned int req_size, unsigned int rx_data, u8 *rx_buf)
-{
-	unsigned int rcv_pkt, i, j;
-	u16 rxsize;
-
-	/* for long packet */
-	rxsize = (u16)((rx_data & 0x00ffff00) >> 8);
-	dev_dbg(dsim->dev, "mipi dsi rx size : %d\n", rxsize);
-	if (rxsize != req_size) {
-		dev_dbg(dsim->dev,
-			"received size mismatch received: %d, requested: %d\n",
-			rxsize, req_size);
-		goto err;
-	}
-
-	for (i = 0; i < (rxsize >> 2); i++) {
-		rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-		dev_dbg(dsim->dev, "received pkt : %08x\n", rcv_pkt);
-		for (j = 0; j < 4; j++) {
-			rx_buf[(i * 4) + j] =
-					(u8)(rcv_pkt >> (j * 8)) & 0xff;
-			dev_dbg(dsim->dev, "received value : %02x\n",
-					(rcv_pkt >> (j * 8)) & 0xff);
-		}
-	}
-	if (rxsize % 4) {
-		rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-		dev_dbg(dsim->dev, "received pkt : %08x\n", rcv_pkt);
-		for (j = 0; j < (rxsize % 4); j++) {
-			rx_buf[(i * 4) + j] =
-					(u8)(rcv_pkt >> (j * 8)) & 0xff;
-			dev_dbg(dsim->dev, "received value : %02x\n",
-					(rcv_pkt >> (j * 8)) & 0xff);
-		}
-	}
-
-	return rxsize;
-
-err:
-	return -EINVAL;
-}
-
-static unsigned int exynos_mipi_dsi_response_size(unsigned int req_size)
-{
-	switch (req_size) {
-	case 1:
-		return MIPI_DSI_RX_GENERIC_SHORT_READ_RESPONSE_1BYTE;
-	case 2:
-		return MIPI_DSI_RX_GENERIC_SHORT_READ_RESPONSE_2BYTE;
-	default:
-		return MIPI_DSI_RX_GENERIC_LONG_READ_RESPONSE;
-	}
-}
-
-int exynos_mipi_dsi_rd_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	unsigned int data0, unsigned int req_size, u8 *rx_buf)
-{
-	unsigned int rx_data, rcv_pkt, i;
-	u8 response = 0;
-	u16 rxsize;
-
-	if (dsim->state == DSIM_STATE_ULPS) {
-		dev_err(dsim->dev, "state is ULPS.\n");
-
-		return -EINVAL;
-	}
-
-	/* FIXME!!! */
-	msleep(20);
-
-	mutex_lock(&dsim->lock);
-	reinit_completion(&dsim_rd_comp);
-	exynos_mipi_dsi_rd_tx_header(dsim,
-		MIPI_DSI_SET_MAXIMUM_RETURN_PACKET_SIZE, req_size);
-
-	response = exynos_mipi_dsi_response_size(req_size);
-
-	switch (data_id) {
-	case MIPI_DSI_GENERIC_READ_REQUEST_0_PARAM:
-	case MIPI_DSI_GENERIC_READ_REQUEST_1_PARAM:
-	case MIPI_DSI_GENERIC_READ_REQUEST_2_PARAM:
-	case MIPI_DSI_DCS_READ:
-		exynos_mipi_dsi_rd_tx_header(dsim,
-			data_id, data0);
-		/* process response func should be implemented. */
-		break;
-	default:
-		dev_warn(dsim->dev,
-			"data id %x is not supported current DSI spec.\n",
-			data_id);
-
-		mutex_unlock(&dsim->lock);
-		return -EINVAL;
-	}
-
-	if (!wait_for_completion_interruptible_timeout(&dsim_rd_comp,
-				MIPI_FIFO_TIMEOUT)) {
-		pr_err("RX done interrupt timeout\n");
-		mutex_unlock(&dsim->lock);
-		return 0;
-	}
-
-	msleep(20);
-
-	rx_data = exynos_mipi_dsi_rd_rx_fifo(dsim);
-
-	if ((u8)(rx_data & 0xff) != response) {
-		printk(KERN_ERR
-			"mipi dsi wrong response rx_data : %x, response:%x\n",
-			rx_data, response);
-		goto clear_rx_fifo;
-	}
-
-	if (req_size <= 2) {
-		/* for short packet */
-		for (i = 0; i < req_size; i++)
-			rx_buf[i] = (rx_data >> (8 + (i * 8))) & 0xff;
-		rxsize = req_size;
-	} else {
-		/* for long packet */
-		rxsize = exynos_mipi_dsi_long_data_rd(dsim, req_size, rx_data,
-							rx_buf);
-		if (rxsize != req_size)
-			goto clear_rx_fifo;
-	}
-
-	rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-
-	msleep(20);
-
-	if (rcv_pkt != MIPI_RX_FIFO_READ_DONE) {
-		dev_info(dsim->dev,
-			"Can't found RX FIFO READ DONE FLAG : %x\n", rcv_pkt);
-		goto clear_rx_fifo;
-	}
-
-	mutex_unlock(&dsim->lock);
-
-	return rxsize;
-
-clear_rx_fifo:
-	i = 0;
-	while (1) {
-		rcv_pkt = exynos_mipi_dsi_rd_rx_fifo(dsim);
-		if ((rcv_pkt == MIPI_RX_FIFO_READ_DONE)
-				|| (i > MIPI_MAX_RX_FIFO))
-			break;
-		dev_dbg(dsim->dev,
-				"mipi dsi clear rx fifo : %08x\n", rcv_pkt);
-		i++;
-	}
-	dev_info(dsim->dev,
-		"mipi dsi rx done count : %d, rcv_pkt : %08x\n", i, rcv_pkt);
-
-	mutex_unlock(&dsim->lock);
-
-	return 0;
-}
-
-static int exynos_mipi_dsi_pll_on(struct mipi_dsim_device *dsim,
-				unsigned int enable)
-{
-	int sw_timeout;
-
-	if (enable) {
-		sw_timeout = 1000;
-
-		exynos_mipi_dsi_enable_pll(dsim, 1);
-		while (1) {
-			sw_timeout--;
-			if (exynos_mipi_dsi_is_pll_stable(dsim))
-				return 0;
-			if (sw_timeout == 0)
-				return -EINVAL;
-		}
-	} else
-		exynos_mipi_dsi_enable_pll(dsim, 0);
-
-	return 0;
-}
-
-static unsigned long exynos_mipi_dsi_change_pll(struct mipi_dsim_device *dsim,
-	unsigned int pre_divider, unsigned int main_divider,
-	unsigned int scaler)
-{
-	unsigned long dfin_pll, dfvco, dpll_out;
-	unsigned int i, freq_band = 0xf;
-
-	dfin_pll = (FIN_HZ / pre_divider);
-
-	/******************************************************
-	 *	Serial Clock(=ByteClk X 8)	FreqBand[3:0] *
-	 ******************************************************
-	 *	~ 99.99 MHz			0000
-	 *	100 ~ 119.99 MHz		0001
-	 *	120 ~ 159.99 MHz		0010
-	 *	160 ~ 199.99 MHz		0011
-	 *	200 ~ 239.99 MHz		0100
-	 *	140 ~ 319.99 MHz		0101
-	 *	320 ~ 389.99 MHz		0110
-	 *	390 ~ 449.99 MHz		0111
-	 *	450 ~ 509.99 MHz		1000
-	 *	510 ~ 559.99 MHz		1001
-	 *	560 ~ 639.99 MHz		1010
-	 *	640 ~ 689.99 MHz		1011
-	 *	690 ~ 769.99 MHz		1100
-	 *	770 ~ 869.99 MHz		1101
-	 *	870 ~ 949.99 MHz		1110
-	 *	950 ~ 1000 MHz			1111
-	 ******************************************************/
-	if (dfin_pll < DFIN_PLL_MIN_HZ || dfin_pll > DFIN_PLL_MAX_HZ) {
-		dev_warn(dsim->dev, "fin_pll range should be 6MHz ~ 12MHz\n");
-		exynos_mipi_dsi_enable_afc(dsim, 0, 0);
-	} else {
-		if (dfin_pll < 7 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x1);
-		else if (dfin_pll < 8 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x0);
-		else if (dfin_pll < 9 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x3);
-		else if (dfin_pll < 10 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x2);
-		else if (dfin_pll < 11 * MHZ)
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x5);
-		else
-			exynos_mipi_dsi_enable_afc(dsim, 1, 0x4);
-	}
-
-	dfvco = dfin_pll * main_divider;
-	dev_dbg(dsim->dev, "dfvco = %lu, dfin_pll = %lu, main_divider = %d\n",
-				dfvco, dfin_pll, main_divider);
-	if (dfvco < DFVCO_MIN_HZ || dfvco > DFVCO_MAX_HZ)
-		dev_warn(dsim->dev, "fvco range should be 500MHz ~ 1000MHz\n");
-
-	dpll_out = dfvco / (1 << scaler);
-	dev_dbg(dsim->dev, "dpll_out = %lu, dfvco = %lu, scaler = %d\n",
-		dpll_out, dfvco, scaler);
-
-	for (i = 0; i < ARRAY_SIZE(dpll_table); i++) {
-		if (dpll_out < dpll_table[i] * MHZ) {
-			freq_band = i;
-			break;
-		}
-	}
-
-	dev_dbg(dsim->dev, "freq_band = %d\n", freq_band);
-
-	exynos_mipi_dsi_pll_freq(dsim, pre_divider, main_divider, scaler);
-
-	exynos_mipi_dsi_hs_zero_ctrl(dsim, 0);
-	exynos_mipi_dsi_prep_ctrl(dsim, 0);
-
-	/* Freq Band */
-	exynos_mipi_dsi_pll_freq_band(dsim, freq_band);
-
-	/* Stable time */
-	exynos_mipi_dsi_pll_stable_time(dsim, dsim->dsim_config->pll_stable_time);
-
-	/* Enable PLL */
-	dev_dbg(dsim->dev, "FOUT of mipi dphy pll is %luMHz\n",
-		(dpll_out / MHZ));
-
-	return dpll_out;
-}
-
-static int exynos_mipi_dsi_set_clock(struct mipi_dsim_device *dsim,
-	unsigned int byte_clk_sel, unsigned int enable)
-{
-	unsigned int esc_div;
-	unsigned long esc_clk_error_rate;
-	unsigned long hs_clk = 0, byte_clk = 0, escape_clk = 0;
-
-	if (enable) {
-		dsim->e_clk_src = byte_clk_sel;
-
-		/* Escape mode clock and byte clock source */
-		exynos_mipi_dsi_set_byte_clock_src(dsim, byte_clk_sel);
-
-		/* DPHY, DSIM Link : D-PHY clock out */
-		if (byte_clk_sel == DSIM_PLL_OUT_DIV8) {
-			hs_clk = exynos_mipi_dsi_change_pll(dsim,
-				dsim->dsim_config->p, dsim->dsim_config->m,
-				dsim->dsim_config->s);
-			if (hs_clk == 0) {
-				dev_err(dsim->dev,
-					"failed to get hs clock.\n");
-				return -EINVAL;
-			}
-
-			byte_clk = hs_clk / 8;
-			exynos_mipi_dsi_enable_pll_bypass(dsim, 0);
-			exynos_mipi_dsi_pll_on(dsim, 1);
-		/* DPHY : D-PHY clock out, DSIM link : external clock out */
-		} else if (byte_clk_sel == DSIM_EXT_CLK_DIV8) {
-			dev_warn(dsim->dev, "this project is not support\n");
-			dev_warn(dsim->dev,
-				"external clock source for MIPI DSIM.\n");
-		} else if (byte_clk_sel == DSIM_EXT_CLK_BYPASS) {
-			dev_warn(dsim->dev, "this project is not support\n");
-			dev_warn(dsim->dev,
-				"external clock source for MIPI DSIM\n");
-		}
-
-		/* escape clock divider */
-		esc_div = byte_clk / (dsim->dsim_config->esc_clk);
-		dev_dbg(dsim->dev,
-			"esc_div = %d, byte_clk = %lu, esc_clk = %lu\n",
-			esc_div, byte_clk, dsim->dsim_config->esc_clk);
-		if ((byte_clk / esc_div) >= (20 * MHZ) ||
-				(byte_clk / esc_div) >
-					dsim->dsim_config->esc_clk)
-			esc_div += 1;
-
-		escape_clk = byte_clk / esc_div;
-		dev_dbg(dsim->dev,
-			"escape_clk = %lu, byte_clk = %lu, esc_div = %d\n",
-			escape_clk, byte_clk, esc_div);
-
-		/* enable escape clock. */
-		exynos_mipi_dsi_enable_byte_clock(dsim, 1);
-
-		/* enable byte clk and escape clock */
-		exynos_mipi_dsi_set_esc_clk_prs(dsim, 1, esc_div);
-		/* escape clock on lane */
-		exynos_mipi_dsi_enable_esc_clk_on_lane(dsim,
-			(DSIM_LANE_CLOCK | dsim->data_lane), 1);
-
-		dev_dbg(dsim->dev, "byte clock is %luMHz\n",
-			(byte_clk / MHZ));
-		dev_dbg(dsim->dev, "escape clock that user's need is %lu\n",
-			(dsim->dsim_config->esc_clk / MHZ));
-		dev_dbg(dsim->dev, "escape clock divider is %x\n", esc_div);
-		dev_dbg(dsim->dev, "escape clock is %luMHz\n",
-			((byte_clk / esc_div) / MHZ));
-
-		if ((byte_clk / esc_div) > escape_clk) {
-			esc_clk_error_rate = escape_clk /
-				(byte_clk / esc_div);
-			dev_warn(dsim->dev, "error rate is %lu over.\n",
-				(esc_clk_error_rate / 100));
-		} else if ((byte_clk / esc_div) < (escape_clk)) {
-			esc_clk_error_rate = (byte_clk / esc_div) /
-				escape_clk;
-			dev_warn(dsim->dev, "error rate is %lu under.\n",
-				(esc_clk_error_rate / 100));
-		}
-	} else {
-		exynos_mipi_dsi_enable_esc_clk_on_lane(dsim,
-			(DSIM_LANE_CLOCK | dsim->data_lane), 0);
-		exynos_mipi_dsi_set_esc_clk_prs(dsim, 0, 0);
-
-		/* disable escape clock. */
-		exynos_mipi_dsi_enable_byte_clock(dsim, 0);
-
-		if (byte_clk_sel == DSIM_PLL_OUT_DIV8)
-			exynos_mipi_dsi_pll_on(dsim, 0);
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_init_dsim(struct mipi_dsim_device *dsim)
-{
-	dsim->state = DSIM_STATE_INIT;
-
-	switch (dsim->dsim_config->e_no_data_lane) {
-	case DSIM_DATA_LANE_1:
-		dsim->data_lane = DSIM_LANE_DATA0;
-		break;
-	case DSIM_DATA_LANE_2:
-		dsim->data_lane = DSIM_LANE_DATA0 | DSIM_LANE_DATA1;
-		break;
-	case DSIM_DATA_LANE_3:
-		dsim->data_lane = DSIM_LANE_DATA0 | DSIM_LANE_DATA1 |
-			DSIM_LANE_DATA2;
-		break;
-	case DSIM_DATA_LANE_4:
-		dsim->data_lane = DSIM_LANE_DATA0 | DSIM_LANE_DATA1 |
-			DSIM_LANE_DATA2 | DSIM_LANE_DATA3;
-		break;
-	default:
-		dev_info(dsim->dev, "data lane is invalid.\n");
-		return -EINVAL;
-	}
-
-	exynos_mipi_dsi_sw_reset(dsim);
-	exynos_mipi_dsi_func_reset(dsim);
-
-	exynos_mipi_dsi_dp_dn_swap(dsim, 0);
-
-	return 0;
-}
-
-void exynos_mipi_dsi_init_interrupt(struct mipi_dsim_device *dsim)
-{
-	unsigned int src = 0;
-
-	src = (INTSRC_SFR_FIFO_EMPTY | INTSRC_RX_DATA_DONE);
-	exynos_mipi_dsi_set_interrupt(dsim, src, 1);
-
-	src = 0;
-	src = ~(INTMSK_RX_DONE | INTMSK_FIFO_EMPTY);
-	exynos_mipi_dsi_set_interrupt_mask(dsim, src, 1);
-}
-
-int exynos_mipi_dsi_enable_frame_done_int(struct mipi_dsim_device *dsim,
-	unsigned int enable)
-{
-	/* enable only frame done interrupt */
-	exynos_mipi_dsi_set_interrupt_mask(dsim, INTMSK_FRAME_DONE, enable);
-
-	return 0;
-}
-
-void exynos_mipi_dsi_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-
-	/* consider Main display and Sub display. */
-
-	exynos_mipi_dsi_set_main_stand_by(dsim, enable);
-}
-
-int exynos_mipi_dsi_set_display_mode(struct mipi_dsim_device *dsim,
-	struct mipi_dsim_config *dsim_config)
-{
-	struct mipi_dsim_platform_data *dsim_pd;
-	struct fb_videomode *timing;
-
-	dsim_pd = (struct mipi_dsim_platform_data *)dsim->pd;
-	timing = (struct fb_videomode *)dsim_pd->lcd_panel_info;
-
-	/* in case of VIDEO MODE (RGB INTERFACE), it sets polarities. */
-	if (dsim_config->e_interface == (u32) DSIM_VIDEO) {
-		if (dsim_config->auto_vertical_cnt == 0) {
-			exynos_mipi_dsi_set_main_disp_vporch(dsim,
-				dsim_config->cmd_allow,
-				timing->lower_margin,
-				timing->upper_margin);
-			exynos_mipi_dsi_set_main_disp_hporch(dsim,
-				timing->right_margin,
-				timing->left_margin);
-			exynos_mipi_dsi_set_main_disp_sync_area(dsim,
-				timing->vsync_len,
-				timing->hsync_len);
-		}
-	}
-
-	exynos_mipi_dsi_set_main_disp_resol(dsim, timing->xres,
-			timing->yres);
-
-	exynos_mipi_dsi_display_config(dsim, dsim_config);
-
-	dev_info(dsim->dev, "lcd panel ==> width = %d, height = %d\n",
-			timing->xres, timing->yres);
-
-	return 0;
-}
-
-int exynos_mipi_dsi_init_link(struct mipi_dsim_device *dsim)
-{
-	unsigned int time_out = 100;
-
-	switch (dsim->state) {
-	case DSIM_STATE_INIT:
-		exynos_mipi_dsi_init_fifo_pointer(dsim, 0x1f);
-
-		/* dsi configuration */
-		exynos_mipi_dsi_init_config(dsim);
-		exynos_mipi_dsi_enable_lane(dsim, DSIM_LANE_CLOCK, 1);
-		exynos_mipi_dsi_enable_lane(dsim, dsim->data_lane, 1);
-
-		/* set clock configuration */
-		exynos_mipi_dsi_set_clock(dsim, dsim->dsim_config->e_byte_clk, 1);
-
-		/* check clock and data lane state are stop state */
-		while (!(exynos_mipi_dsi_is_lane_state(dsim))) {
-			time_out--;
-			if (time_out == 0) {
-				dev_err(dsim->dev,
-					"DSI Master is not stop state.\n");
-				dev_err(dsim->dev,
-					"Check initialization process\n");
-
-				return -EINVAL;
-			}
-		}
-		if (time_out != 0) {
-			dev_info(dsim->dev,
-				"DSI Master driver has been completed.\n");
-			dev_info(dsim->dev, "DSI Master state is stop state\n");
-		}
-
-		dsim->state = DSIM_STATE_STOP;
-
-		/* BTA sequence counters */
-		exynos_mipi_dsi_set_stop_state_counter(dsim,
-			dsim->dsim_config->stop_holding_cnt);
-		exynos_mipi_dsi_set_bta_timeout(dsim,
-			dsim->dsim_config->bta_timeout);
-		exynos_mipi_dsi_set_lpdr_timeout(dsim,
-			dsim->dsim_config->rx_timeout);
-
-		return 0;
-	default:
-		dev_info(dsim->dev, "DSI Master is already init.\n");
-		return 0;
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_set_hs_enable(struct mipi_dsim_device *dsim)
-{
-	if (dsim->state != DSIM_STATE_STOP) {
-		dev_warn(dsim->dev, "DSIM is not in stop state.\n");
-		return 0;
-	}
-
-	if (dsim->e_clk_src == DSIM_EXT_CLK_BYPASS) {
-		dev_warn(dsim->dev, "clock source is external bypass.\n");
-		return 0;
-	}
-
-	dsim->state = DSIM_STATE_HSCLKEN;
-
-	 /* set LCDC and CPU transfer mode to HS. */
-	exynos_mipi_dsi_set_lcdc_transfer_mode(dsim, 0);
-	exynos_mipi_dsi_set_cpu_transfer_mode(dsim, 0);
-	exynos_mipi_dsi_enable_hs_clock(dsim, 1);
-
-	return 0;
-}
-
-int exynos_mipi_dsi_set_data_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int mode)
-{
-	if (mode) {
-		if (dsim->state != DSIM_STATE_HSCLKEN) {
-			dev_err(dsim->dev, "HS Clock lane is not enabled.\n");
-			return -EINVAL;
-		}
-
-		exynos_mipi_dsi_set_lcdc_transfer_mode(dsim, 0);
-	} else {
-		if (dsim->state == DSIM_STATE_INIT || dsim->state ==
-			DSIM_STATE_ULPS) {
-			dev_err(dsim->dev,
-				"DSI Master is not STOP or HSDT state.\n");
-			return -EINVAL;
-		}
-
-		exynos_mipi_dsi_set_cpu_transfer_mode(dsim, 0);
-	}
-
-	return 0;
-}
-
-int exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim)
-{
-	return _exynos_mipi_dsi_get_frame_done_status(dsim);
-}
-
-int exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim)
-{
-	_exynos_mipi_dsi_clear_frame_done(dsim);
-
-	return 0;
-}
-
-int exynos_mipi_dsi_fifo_clear(struct mipi_dsim_device *dsim,
-				unsigned int val)
-{
-	int try = TRY_FIFO_CLEAR;
-
-	exynos_mipi_dsi_sw_reset_release(dsim);
-	exynos_mipi_dsi_func_reset(dsim);
-
-	do {
-		if (exynos_mipi_dsi_get_sw_reset_release(dsim)) {
-			exynos_mipi_dsi_init_interrupt(dsim);
-			dev_dbg(dsim->dev, "reset release done.\n");
-			return 0;
-		}
-	} while (--try);
-
-	dev_err(dsim->dev, "failed to clear dsim fifo.\n");
-	return -EAGAIN;
-}
-
-MODULE_AUTHOR("InKi Dae <inki.dae@samsung.com>");
-MODULE_DESCRIPTION("Samsung SoC MIPI-DSI common driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h b/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h
deleted file mode 100644
index 412552274df3..000000000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_common.h
+++ /dev/null
@@ -1,46 +0,0 @@
-/* linux/drivers/video/exynos_mipi_dsi_common.h
- *
- * Header file for Samsung SoC MIPI-DSI common driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSI_COMMON_H
-#define _EXYNOS_MIPI_DSI_COMMON_H
-
-static DECLARE_COMPLETION(dsim_rd_comp);
-static DECLARE_COMPLETION(dsim_wr_comp);
-
-int exynos_mipi_dsi_wr_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	const unsigned char *data0, unsigned int data_size);
-int exynos_mipi_dsi_rd_data(struct mipi_dsim_device *dsim, unsigned int data_id,
-	unsigned int data0, unsigned int req_size, u8 *rx_buf);
-irqreturn_t exynos_mipi_dsi_interrupt_handler(int irq, void *dev_id);
-void exynos_mipi_dsi_init_interrupt(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_init_dsim(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable);
-int exynos_mipi_dsi_set_display_mode(struct mipi_dsim_device *dsim,
-			struct mipi_dsim_config *dsim_info);
-int exynos_mipi_dsi_init_link(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_set_hs_enable(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_set_data_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int mode);
-int exynos_mipi_dsi_enable_frame_done_int(struct mipi_dsim_device *dsim,
-	unsigned int enable);
-int exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim);
-
-extern struct fb_info *registered_fb[FB_MAX] __read_mostly;
-
-int exynos_mipi_dsi_fifo_clear(struct mipi_dsim_device *dsim,
-				unsigned int val);
-
-#endif /* _EXYNOS_MIPI_DSI_COMMON_H */
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c b/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c
deleted file mode 100644
index c148d06540c1..000000000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.c
+++ /dev/null
@@ -1,618 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi_lowlevel.c
- *
- * Samsung SoC MIPI-DSI lowlevel driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/delay.h>
-#include <linux/fs.h>
-#include <linux/mm.h>
-#include <linux/ctype.h>
-#include <linux/platform_device.h>
-#include <linux/io.h>
-
-#include <video/exynos_mipi_dsim.h>
-
-#include "exynos_mipi_dsi_regs.h"
-#include "exynos_mipi_dsi_lowlevel.h"
-
-void exynos_mipi_dsi_func_reset(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_SWRST);
-
-	reg |= DSIM_FUNCRST;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SWRST);
-}
-
-void exynos_mipi_dsi_sw_reset(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_SWRST);
-
-	reg |= DSIM_SWRST;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SWRST);
-}
-
-void exynos_mipi_dsi_sw_reset_release(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	reg |= INTSRC_SW_RST_RELEASE;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-int exynos_mipi_dsi_get_sw_reset_release(struct mipi_dsim_device *dsim)
-{
-	return (readl(dsim->reg_base + EXYNOS_DSIM_INTSRC)) &
-			INTSRC_SW_RST_RELEASE;
-}
-
-unsigned int exynos_mipi_dsi_read_interrupt_mask(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_INTMSK);
-
-	return reg;
-}
-
-void exynos_mipi_dsi_set_interrupt_mask(struct mipi_dsim_device *dsim,
-		unsigned int mode, unsigned int mask)
-{
-	unsigned int reg = 0;
-
-	if (mask)
-		reg |= mode;
-	else
-		reg &= ~mode;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTMSK);
-}
-
-void exynos_mipi_dsi_init_fifo_pointer(struct mipi_dsim_device *dsim,
-		unsigned int cfg)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_FIFOCTRL);
-
-	writel(reg & ~(cfg), dsim->reg_base + EXYNOS_DSIM_FIFOCTRL);
-	mdelay(10);
-	reg |= cfg;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_FIFOCTRL);
-}
-
-/*
- * this function set PLL P, M and S value in D-PHY
- */
-void exynos_mipi_dsi_set_phy_tunning(struct mipi_dsim_device *dsim,
-		unsigned int value)
-{
-	writel(DSIM_AFC_CTL(value), dsim->reg_base + EXYNOS_DSIM_PHYACCHR);
-}
-
-void exynos_mipi_dsi_set_main_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-
-	reg &= ~DSIM_MAIN_STAND_BY;
-
-	if (enable)
-		reg |= DSIM_MAIN_STAND_BY;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-}
-
-void exynos_mipi_dsi_set_main_disp_resol(struct mipi_dsim_device *dsim,
-	unsigned int width_resol, unsigned int height_resol)
-{
-	unsigned int reg;
-
-	/* standby should be set after configuration so set to not ready*/
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MDRESOL)) &
-		~(DSIM_MAIN_STAND_BY);
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-
-	reg &= ~((0x7ff << 16) | (0x7ff << 0));
-	reg |= DSIM_MAIN_VRESOL(height_resol) | DSIM_MAIN_HRESOL(width_resol);
-
-	reg |= DSIM_MAIN_STAND_BY;
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MDRESOL);
-}
-
-void exynos_mipi_dsi_set_main_disp_vporch(struct mipi_dsim_device *dsim,
-	unsigned int cmd_allow, unsigned int vfront, unsigned int vback)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MVPORCH)) &
-		~((DSIM_CMD_ALLOW_MASK) | (DSIM_STABLE_VFP_MASK) |
-		(DSIM_MAIN_VBP_MASK));
-
-	reg |= (DSIM_CMD_ALLOW_SHIFT(cmd_allow & 0xf) |
-		DSIM_STABLE_VFP_SHIFT(vfront & 0x7ff) |
-		DSIM_MAIN_VBP_SHIFT(vback & 0x7ff));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MVPORCH);
-}
-
-void exynos_mipi_dsi_set_main_disp_hporch(struct mipi_dsim_device *dsim,
-	unsigned int front, unsigned int back)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MHPORCH)) &
-		~((DSIM_MAIN_HFP_MASK) | (DSIM_MAIN_HBP_MASK));
-
-	reg |= DSIM_MAIN_HFP_SHIFT(front) | DSIM_MAIN_HBP_SHIFT(back);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MHPORCH);
-}
-
-void exynos_mipi_dsi_set_main_disp_sync_area(struct mipi_dsim_device *dsim,
-	unsigned int vert, unsigned int hori)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_MSYNC)) &
-		~((DSIM_MAIN_VSA_MASK) | (DSIM_MAIN_HSA_MASK));
-
-	reg |= (DSIM_MAIN_VSA_SHIFT(vert & 0x3ff) |
-		DSIM_MAIN_HSA_SHIFT(hori));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_MSYNC);
-}
-
-void exynos_mipi_dsi_set_sub_disp_resol(struct mipi_dsim_device *dsim,
-	unsigned int vert, unsigned int hori)
-{
-	unsigned int reg;
-
-	reg = (readl(dsim->reg_base + EXYNOS_DSIM_SDRESOL)) &
-		~(DSIM_SUB_STANDY_MASK);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SDRESOL);
-
-	reg &= ~(DSIM_SUB_VRESOL_MASK) | ~(DSIM_SUB_HRESOL_MASK);
-	reg |= (DSIM_SUB_VRESOL_SHIFT(vert & 0x7ff) |
-		DSIM_SUB_HRESOL_SHIFT(hori & 0x7ff));
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SDRESOL);
-
-	reg |= DSIM_SUB_STANDY_SHIFT(1);
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_SDRESOL);
-}
-
-void exynos_mipi_dsi_init_config(struct mipi_dsim_device *dsim)
-{
-	struct mipi_dsim_config *dsim_config = dsim->dsim_config;
-
-	unsigned int cfg = (readl(dsim->reg_base + EXYNOS_DSIM_CONFIG)) &
-		~((1 << 28) | (0x1f << 20) | (0x3 << 5));
-
-	cfg =	((DSIM_AUTO_FLUSH(dsim_config->auto_flush)) |
-		(DSIM_EOT_DISABLE(dsim_config->eot_disable)) |
-		(DSIM_AUTO_MODE_SHIFT(dsim_config->auto_vertical_cnt)) |
-		(DSIM_HSE_MODE_SHIFT(dsim_config->hse)) |
-		(DSIM_HFP_MODE_SHIFT(dsim_config->hfp)) |
-		(DSIM_HBP_MODE_SHIFT(dsim_config->hbp)) |
-		(DSIM_HSA_MODE_SHIFT(dsim_config->hsa)) |
-		(DSIM_NUM_OF_DATALANE_SHIFT(dsim_config->e_no_data_lane)));
-
-	writel(cfg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-void exynos_mipi_dsi_display_config(struct mipi_dsim_device *dsim,
-				struct mipi_dsim_config *dsim_config)
-{
-	u32 reg = (readl(dsim->reg_base + EXYNOS_DSIM_CONFIG)) &
-		~((0x3 << 26) | (1 << 25) | (0x3 << 18) | (0x7 << 12) |
-		(0x3 << 16) | (0x7 << 8));
-
-	if (dsim_config->e_interface == DSIM_VIDEO)
-		reg |= (1 << 25);
-	else if (dsim_config->e_interface == DSIM_COMMAND)
-		reg &= ~(1 << 25);
-	else {
-		dev_err(dsim->dev, "unknown lcd type.\n");
-		return;
-	}
-
-	/* main lcd */
-	reg |= ((u8) (dsim_config->e_burst_mode) & 0x3) << 26 |
-		((u8) (dsim_config->e_virtual_ch) & 0x3) << 18 |
-		((u8) (dsim_config->e_pixel_format) & 0x7) << 12;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-void exynos_mipi_dsi_enable_lane(struct mipi_dsim_device *dsim, unsigned int lane,
-	unsigned int enable)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_CONFIG);
-
-	if (enable)
-		reg |= DSIM_LANE_ENx(lane);
-	else
-		reg &= ~DSIM_LANE_ENx(lane);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-
-void exynos_mipi_dsi_set_data_lane_number(struct mipi_dsim_device *dsim,
-	unsigned int count)
-{
-	unsigned int cfg;
-
-	/* get the data lane number. */
-	cfg = DSIM_NUM_OF_DATALANE_SHIFT(count);
-
-	writel(cfg, dsim->reg_base + EXYNOS_DSIM_CONFIG);
-}
-
-void exynos_mipi_dsi_enable_afc(struct mipi_dsim_device *dsim, unsigned int enable,
-	unsigned int afc_code)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_PHYACCHR);
-
-	if (enable) {
-		reg |= (1 << 14);
-		reg &= ~(0x7 << 5);
-		reg |= (afc_code & 0x7) << 5;
-	} else
-		reg &= ~(1 << 14);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PHYACCHR);
-}
-
-void exynos_mipi_dsi_enable_pll_bypass(struct mipi_dsim_device *dsim,
-	unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_PLL_BYPASS_SHIFT(0x1));
-
-	reg |= DSIM_PLL_BYPASS_SHIFT(enable);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_set_pll_pms(struct mipi_dsim_device *dsim, unsigned int p,
-	unsigned int m, unsigned int s)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-
-	reg |= ((p & 0x3f) << 13) | ((m & 0x1ff) << 4) | ((s & 0x7) << 1);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_pll_freq_band(struct mipi_dsim_device *dsim,
-		unsigned int freq_band)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(DSIM_FREQ_BAND_SHIFT(0x1f));
-
-	reg |= DSIM_FREQ_BAND_SHIFT(freq_band & 0x1f);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_pll_freq(struct mipi_dsim_device *dsim,
-		unsigned int pre_divider, unsigned int main_divider,
-		unsigned int scaler)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(0x7ffff << 1);
-
-	reg |= (pre_divider & 0x3f) << 13 | (main_divider & 0x1ff) << 4 |
-		(scaler & 0x7) << 1;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_pll_stable_time(struct mipi_dsim_device *dsim,
-	unsigned int lock_time)
-{
-	writel(lock_time, dsim->reg_base + EXYNOS_DSIM_PLLTMR);
-}
-
-void exynos_mipi_dsi_enable_pll(struct mipi_dsim_device *dsim, unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(DSIM_PLL_EN_SHIFT(0x1));
-
-	reg |= DSIM_PLL_EN_SHIFT(enable & 0x1);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_set_byte_clock_src(struct mipi_dsim_device *dsim,
-		unsigned int src)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_BYTE_CLK_SRC_SHIFT(0x3));
-
-	reg |= (DSIM_BYTE_CLK_SRC_SHIFT(src));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_enable_byte_clock(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_BYTE_CLKEN_SHIFT(0x1));
-
-	reg |= DSIM_BYTE_CLKEN_SHIFT(enable);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_set_esc_clk_prs(struct mipi_dsim_device *dsim,
-		unsigned int enable, unsigned int prs_val)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_ESC_CLKEN_SHIFT(0x1) | 0xffff);
-
-	reg |= DSIM_ESC_CLKEN_SHIFT(enable);
-	if (enable)
-		reg |= prs_val;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_enable_esc_clk_on_lane(struct mipi_dsim_device *dsim,
-		unsigned int lane_sel, unsigned int enable)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-
-	if (enable)
-		reg |= DSIM_LANE_ESC_CLKEN(lane_sel);
-	else
-
-		reg &= ~DSIM_LANE_ESC_CLKEN(lane_sel);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_force_dphy_stop_state(struct mipi_dsim_device *dsim,
-	unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE)) &
-		~(DSIM_FORCE_STOP_STATE_SHIFT(0x1));
-
-	reg |= (DSIM_FORCE_STOP_STATE_SHIFT(enable & 0x1));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-unsigned int exynos_mipi_dsi_is_lane_state(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_STATUS);
-
-	/**
-	 * check clock and data lane states.
-	 * if MIPI-DSI controller was enabled at bootloader then
-	 * TX_READY_HS_CLK is enabled otherwise STOP_STATE_CLK.
-	 * so it should be checked for two case.
-	 */
-	if ((reg & DSIM_STOP_STATE_DAT(0xf)) &&
-			((reg & DSIM_STOP_STATE_CLK) ||
-			 (reg & DSIM_TX_READY_HS_CLK)))
-		return 1;
-
-	return 0;
-}
-
-void exynos_mipi_dsi_set_stop_state_counter(struct mipi_dsim_device *dsim,
-		unsigned int cnt_val)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE)) &
-		~(DSIM_STOP_STATE_CNT_SHIFT(0x7ff));
-
-	reg |= (DSIM_STOP_STATE_CNT_SHIFT(cnt_val & 0x7ff));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-void exynos_mipi_dsi_set_bta_timeout(struct mipi_dsim_device *dsim,
-		unsigned int timeout)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_TIMEOUT)) &
-		~(DSIM_BTA_TOUT_SHIFT(0xff));
-
-	reg |= (DSIM_BTA_TOUT_SHIFT(timeout));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_TIMEOUT);
-}
-
-void exynos_mipi_dsi_set_lpdr_timeout(struct mipi_dsim_device *dsim,
-		unsigned int timeout)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_TIMEOUT)) &
-		~(DSIM_LPDR_TOUT_SHIFT(0xffff));
-
-	reg |= (DSIM_LPDR_TOUT_SHIFT(timeout));
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_TIMEOUT);
-}
-
-void exynos_mipi_dsi_set_cpu_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int lp)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-
-	reg &= ~DSIM_CMD_LPDT_LP;
-
-	if (lp)
-		reg |= DSIM_CMD_LPDT_LP;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-void exynos_mipi_dsi_set_lcdc_transfer_mode(struct mipi_dsim_device *dsim,
-		unsigned int lp)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-
-	reg &= ~DSIM_TX_LPDT_LP;
-
-	if (lp)
-		reg |= DSIM_TX_LPDT_LP;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_ESCMODE);
-}
-
-void exynos_mipi_dsi_enable_hs_clock(struct mipi_dsim_device *dsim,
-		unsigned int enable)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_CLKCTRL)) &
-		~(DSIM_TX_REQUEST_HSCLK_SHIFT(0x1));
-
-	reg |= DSIM_TX_REQUEST_HSCLK_SHIFT(enable);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_CLKCTRL);
-}
-
-void exynos_mipi_dsi_dp_dn_swap(struct mipi_dsim_device *dsim,
-		unsigned int swap_en)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_PHYACCHR1);
-
-	reg &= ~(0x3 << 0);
-	reg |= (swap_en & 0x3) << 0;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PHYACCHR1);
-}
-
-void exynos_mipi_dsi_hs_zero_ctrl(struct mipi_dsim_device *dsim,
-		unsigned int hs_zero)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(0xf << 28);
-
-	reg |= ((hs_zero & 0xf) << 28);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-void exynos_mipi_dsi_prep_ctrl(struct mipi_dsim_device *dsim, unsigned int prep)
-{
-	unsigned int reg = (readl(dsim->reg_base + EXYNOS_DSIM_PLLCTRL)) &
-		~(0x7 << 20);
-
-	reg |= ((prep & 0x7) << 20);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PLLCTRL);
-}
-
-unsigned int exynos_mipi_dsi_read_interrupt(struct mipi_dsim_device *dsim)
-{
-	return readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-void exynos_mipi_dsi_clear_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	reg |= src;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-void exynos_mipi_dsi_set_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src, unsigned int enable)
-{
-	unsigned int reg = 0;
-
-	if (enable)
-		reg |= src;
-	else
-		reg &= ~src;
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_INTSRC);
-}
-
-unsigned int exynos_mipi_dsi_is_pll_stable(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg;
-
-	reg = readl(dsim->reg_base + EXYNOS_DSIM_STATUS);
-
-	return reg & (1 << 31) ? 1 : 0;
-}
-
-unsigned int exynos_mipi_dsi_get_fifo_state(struct mipi_dsim_device *dsim)
-{
-	return readl(dsim->reg_base + EXYNOS_DSIM_FIFOCTRL) & ~(0x1f);
-}
-
-void exynos_mipi_dsi_wr_tx_header(struct mipi_dsim_device *dsim,
-	unsigned int di, unsigned int data0, unsigned int data1)
-{
-	unsigned int reg = (data1 << 16) | (data0 << 8) | ((di & 0x3f) << 0);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PKTHDR);
-}
-
-void exynos_mipi_dsi_rd_tx_header(struct mipi_dsim_device *dsim,
-	unsigned int di, unsigned int data0)
-{
-	unsigned int reg = (data0 << 8) | (di << 0);
-
-	writel(reg, dsim->reg_base + EXYNOS_DSIM_PKTHDR);
-}
-
-unsigned int exynos_mipi_dsi_rd_rx_fifo(struct mipi_dsim_device *dsim)
-{
-	return readl(dsim->reg_base + EXYNOS_DSIM_RXFIFO);
-}
-
-unsigned int _exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	return (reg & INTSRC_FRAME_DONE) ? 1 : 0;
-}
-
-void _exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim)
-{
-	unsigned int reg = readl(dsim->reg_base + EXYNOS_DSIM_INTSRC);
-
-	writel(reg | INTSRC_FRAME_DONE, dsim->reg_base +
-		EXYNOS_DSIM_INTSRC);
-}
-
-void exynos_mipi_dsi_wr_tx_data(struct mipi_dsim_device *dsim,
-		unsigned int tx_data)
-{
-	writel(tx_data, dsim->reg_base + EXYNOS_DSIM_PAYLOAD);
-}
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h b/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h
deleted file mode 100644
index 85460701c7ea..000000000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_lowlevel.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* linux/drivers/video/exynos/exynos_mipi_dsi_lowlevel.h
- *
- * Header file for Samsung SoC MIPI-DSI lowlevel driver.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSI_LOWLEVEL_H
-#define _EXYNOS_MIPI_DSI_LOWLEVEL_H
-
-void exynos_mipi_dsi_func_reset(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_sw_reset(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_sw_reset_release(struct mipi_dsim_device *dsim);
-int exynos_mipi_dsi_get_sw_reset_release(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_set_interrupt_mask(struct mipi_dsim_device *dsim,
-	unsigned int mode, unsigned int mask);
-void exynos_mipi_dsi_set_data_lane_number(struct mipi_dsim_device *dsim,
-					unsigned int count);
-void exynos_mipi_dsi_init_fifo_pointer(struct mipi_dsim_device *dsim,
-					unsigned int cfg);
-void exynos_mipi_dsi_set_phy_tunning(struct mipi_dsim_device *dsim,
-				unsigned int value);
-void exynos_mipi_dsi_set_phy_tunning(struct mipi_dsim_device *dsim,
-				unsigned int value);
-void exynos_mipi_dsi_set_main_stand_by(struct mipi_dsim_device *dsim,
-		unsigned int enable);
-void exynos_mipi_dsi_set_main_disp_resol(struct mipi_dsim_device *dsim,
-		unsigned int width_resol, unsigned int height_resol);
-void exynos_mipi_dsi_set_main_disp_vporch(struct mipi_dsim_device *dsim,
-	unsigned int cmd_allow, unsigned int vfront, unsigned int vback);
-void exynos_mipi_dsi_set_main_disp_hporch(struct mipi_dsim_device *dsim,
-			unsigned int front, unsigned int back);
-void exynos_mipi_dsi_set_main_disp_sync_area(struct mipi_dsim_device *dsim,
-				unsigned int vert, unsigned int hori);
-void exynos_mipi_dsi_set_sub_disp_resol(struct mipi_dsim_device *dsim,
-				unsigned int vert, unsigned int hori);
-void exynos_mipi_dsi_init_config(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_display_config(struct mipi_dsim_device *dsim,
-				struct mipi_dsim_config *dsim_config);
-void exynos_mipi_dsi_set_data_lane_number(struct mipi_dsim_device *dsim,
-				unsigned int count);
-void exynos_mipi_dsi_enable_lane(struct mipi_dsim_device *dsim, unsigned int lane,
-				unsigned int enable);
-void exynos_mipi_dsi_enable_afc(struct mipi_dsim_device *dsim, unsigned int enable,
-				unsigned int afc_code);
-void exynos_mipi_dsi_enable_pll_bypass(struct mipi_dsim_device *dsim,
-				unsigned int enable);
-void exynos_mipi_dsi_set_pll_pms(struct mipi_dsim_device *dsim, unsigned int p,
-				unsigned int m, unsigned int s);
-void exynos_mipi_dsi_pll_freq_band(struct mipi_dsim_device *dsim,
-				unsigned int freq_band);
-void exynos_mipi_dsi_pll_freq(struct mipi_dsim_device *dsim,
-			unsigned int pre_divider, unsigned int main_divider,
-			unsigned int scaler);
-void exynos_mipi_dsi_pll_stable_time(struct mipi_dsim_device *dsim,
-			unsigned int lock_time);
-void exynos_mipi_dsi_enable_pll(struct mipi_dsim_device *dsim,
-					unsigned int enable);
-void exynos_mipi_dsi_set_byte_clock_src(struct mipi_dsim_device *dsim,
-					unsigned int src);
-void exynos_mipi_dsi_enable_byte_clock(struct mipi_dsim_device *dsim,
-					unsigned int enable);
-void exynos_mipi_dsi_set_esc_clk_prs(struct mipi_dsim_device *dsim,
-				unsigned int enable, unsigned int prs_val);
-void exynos_mipi_dsi_enable_esc_clk_on_lane(struct mipi_dsim_device *dsim,
-				unsigned int lane_sel, unsigned int enable);
-void exynos_mipi_dsi_force_dphy_stop_state(struct mipi_dsim_device *dsim,
-				unsigned int enable);
-unsigned int exynos_mipi_dsi_is_lane_state(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_set_stop_state_counter(struct mipi_dsim_device *dsim,
-				unsigned int cnt_val);
-void exynos_mipi_dsi_set_bta_timeout(struct mipi_dsim_device *dsim,
-				unsigned int timeout);
-void exynos_mipi_dsi_set_lpdr_timeout(struct mipi_dsim_device *dsim,
-				unsigned int timeout);
-void exynos_mipi_dsi_set_lcdc_transfer_mode(struct mipi_dsim_device *dsim,
-					unsigned int lp);
-void exynos_mipi_dsi_set_cpu_transfer_mode(struct mipi_dsim_device *dsim,
-					unsigned int lp);
-void exynos_mipi_dsi_enable_hs_clock(struct mipi_dsim_device *dsim,
-				unsigned int enable);
-void exynos_mipi_dsi_dp_dn_swap(struct mipi_dsim_device *dsim,
-				unsigned int swap_en);
-void exynos_mipi_dsi_hs_zero_ctrl(struct mipi_dsim_device *dsim,
-				unsigned int hs_zero);
-void exynos_mipi_dsi_prep_ctrl(struct mipi_dsim_device *dsim, unsigned int prep);
-unsigned int exynos_mipi_dsi_read_interrupt(struct mipi_dsim_device *dsim);
-unsigned int exynos_mipi_dsi_read_interrupt_mask(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_clear_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src);
-void exynos_mipi_dsi_set_interrupt(struct mipi_dsim_device *dsim,
-					unsigned int src, unsigned int enable);
-unsigned int exynos_mipi_dsi_is_pll_stable(struct mipi_dsim_device *dsim);
-unsigned int exynos_mipi_dsi_get_fifo_state(struct mipi_dsim_device *dsim);
-unsigned int _exynos_mipi_dsi_get_frame_done_status(struct mipi_dsim_device *dsim);
-void _exynos_mipi_dsi_clear_frame_done(struct mipi_dsim_device *dsim);
-void exynos_mipi_dsi_wr_tx_header(struct mipi_dsim_device *dsim, unsigned int di,
-				unsigned int data0, unsigned int data1);
-void exynos_mipi_dsi_wr_tx_data(struct mipi_dsim_device *dsim,
-		unsigned int tx_data);
-void exynos_mipi_dsi_rd_tx_header(struct mipi_dsim_device *dsim,
-		unsigned int data0, unsigned int data1);
-unsigned int exynos_mipi_dsi_rd_rx_fifo(struct mipi_dsim_device *dsim);
-
-#endif /* _EXYNOS_MIPI_DSI_LOWLEVEL_H */
diff --git a/drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h b/drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h
deleted file mode 100644
index 4227106d3fd0..000000000000
--- a/drivers/video/fbdev/exynos/exynos_mipi_dsi_regs.h
+++ /dev/null
@@ -1,149 +0,0 @@
-/* linux/driver/video/exynos/exynos_mipi_dsi_regs.h
- *
- * Register definition file for Samsung MIPI-DSIM driver
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSI_REGS_H
-#define _EXYNOS_MIPI_DSI_REGS_H
-
-#define EXYNOS_DSIM_STATUS		0x0	/* Status register */
-#define EXYNOS_DSIM_SWRST		0x4	/* Software reset register */
-#define EXYNOS_DSIM_CLKCTRL		0x8	/* Clock control register */
-#define EXYNOS_DSIM_TIMEOUT		0xc	/* Time out register */
-#define EXYNOS_DSIM_CONFIG		0x10	/* Configuration register */
-#define EXYNOS_DSIM_ESCMODE		0x14	/* Escape mode register */
-
-/* Main display image resolution register */
-#define EXYNOS_DSIM_MDRESOL		0x18
-#define EXYNOS_DSIM_MVPORCH		0x1c	/* Main display Vporch register */
-#define EXYNOS_DSIM_MHPORCH		0x20	/* Main display Hporch register */
-#define EXYNOS_DSIM_MSYNC		0x24	/* Main display sync area register */
-
-/* Sub display image resolution register */
-#define EXYNOS_DSIM_SDRESOL		0x28
-#define EXYNOS_DSIM_INTSRC		0x2c	/* Interrupt source register */
-#define EXYNOS_DSIM_INTMSK		0x30	/* Interrupt mask register */
-#define EXYNOS_DSIM_PKTHDR		0x34	/* Packet Header FIFO register */
-#define EXYNOS_DSIM_PAYLOAD		0x38	/* Payload FIFO register */
-#define EXYNOS_DSIM_RXFIFO		0x3c	/* Read FIFO register */
-#define EXYNOS_DSIM_FIFOTHLD		0x40	/* FIFO threshold level register */
-#define EXYNOS_DSIM_FIFOCTRL		0x44	/* FIFO status and control register */
-
-/* FIFO memory AC characteristic register */
-#define EXYNOS_DSIM_PLLCTRL		0x4c	/* PLL control register */
-#define EXYNOS_DSIM_PLLTMR		0x50	/* PLL timer register */
-#define EXYNOS_DSIM_PHYACCHR		0x54	/* D-PHY AC characteristic register */
-#define EXYNOS_DSIM_PHYACCHR1		0x58	/* D-PHY AC characteristic register1 */
-
-/* DSIM_STATUS */
-#define DSIM_STOP_STATE_DAT(x)		(((x) & 0xf) << 0)
-#define DSIM_STOP_STATE_CLK		(1 << 8)
-#define DSIM_TX_READY_HS_CLK		(1 << 10)
-
-/* DSIM_SWRST */
-#define DSIM_FUNCRST			(1 << 16)
-#define DSIM_SWRST			(1 << 0)
-
-/* EXYNOS_DSIM_TIMEOUT */
-#define DSIM_LPDR_TOUT_SHIFT(x)		((x) << 0)
-#define DSIM_BTA_TOUT_SHIFT(x)		((x) << 16)
-
-/* EXYNOS_DSIM_CLKCTRL */
-#define DSIM_LANE_ESC_CLKEN(x)		(((x) & 0x1f) << 19)
-#define DSIM_BYTE_CLKEN_SHIFT(x)	((x) << 24)
-#define DSIM_BYTE_CLK_SRC_SHIFT(x)	((x) <<	25)
-#define DSIM_PLL_BYPASS_SHIFT(x)	((x) <<	27)
-#define DSIM_ESC_CLKEN_SHIFT(x)		((x) << 28)
-#define DSIM_TX_REQUEST_HSCLK_SHIFT(x)	((x) << 31)
-
-/* EXYNOS_DSIM_CONFIG */
-#define DSIM_LANE_ENx(x)		(((x) & 0x1f) << 0)
-#define DSIM_NUM_OF_DATALANE_SHIFT(x)	((x) << 5)
-#define DSIM_HSA_MODE_SHIFT(x)		((x) << 20)
-#define DSIM_HBP_MODE_SHIFT(x)		((x) << 21)
-#define DSIM_HFP_MODE_SHIFT(x)		((x) << 22)
-#define DSIM_HSE_MODE_SHIFT(x)		((x) << 23)
-#define DSIM_AUTO_MODE_SHIFT(x)		((x) << 24)
-#define DSIM_EOT_DISABLE(x)		((x) << 28)
-#define DSIM_AUTO_FLUSH(x)		((x) << 29)
-
-#define DSIM_NUM_OF_DATA_LANE(x)	((x) << DSIM_NUM_OF_DATALANE_SHIFT)
-
-/* EXYNOS_DSIM_ESCMODE */
-#define DSIM_TX_LPDT_LP			(1 << 6)
-#define DSIM_CMD_LPDT_LP		(1 << 7)
-#define DSIM_FORCE_STOP_STATE_SHIFT(x)	((x) << 20)
-#define DSIM_STOP_STATE_CNT_SHIFT(x)	((x) << 21)
-
-/* EXYNOS_DSIM_MDRESOL */
-#define DSIM_MAIN_STAND_BY		(1 << 31)
-#define DSIM_MAIN_VRESOL(x)		(((x) & 0x7ff) << 16)
-#define DSIM_MAIN_HRESOL(x)		(((x) & 0X7ff) << 0)
-
-/* EXYNOS_DSIM_MVPORCH */
-#define DSIM_CMD_ALLOW_SHIFT(x)		((x) << 28)
-#define DSIM_STABLE_VFP_SHIFT(x)	((x) << 16)
-#define DSIM_MAIN_VBP_SHIFT(x)		((x) << 0)
-#define DSIM_CMD_ALLOW_MASK		(0xf << 28)
-#define DSIM_STABLE_VFP_MASK		(0x7ff << 16)
-#define DSIM_MAIN_VBP_MASK		(0x7ff << 0)
-
-/* EXYNOS_DSIM_MHPORCH */
-#define DSIM_MAIN_HFP_SHIFT(x)		((x) << 16)
-#define DSIM_MAIN_HBP_SHIFT(x)		((x) << 0)
-#define DSIM_MAIN_HFP_MASK		((0xffff) << 16)
-#define DSIM_MAIN_HBP_MASK		((0xffff) << 0)
-
-/* EXYNOS_DSIM_MSYNC */
-#define DSIM_MAIN_VSA_SHIFT(x)		((x) << 22)
-#define DSIM_MAIN_HSA_SHIFT(x)		((x) << 0)
-#define DSIM_MAIN_VSA_MASK		((0x3ff) << 22)
-#define DSIM_MAIN_HSA_MASK		((0xffff) << 0)
-
-/* EXYNOS_DSIM_SDRESOL */
-#define DSIM_SUB_STANDY_SHIFT(x)	((x) << 31)
-#define DSIM_SUB_VRESOL_SHIFT(x)	((x) << 16)
-#define DSIM_SUB_HRESOL_SHIFT(x)	((x) << 0)
-#define DSIM_SUB_STANDY_MASK		((0x1) << 31)
-#define DSIM_SUB_VRESOL_MASK		((0x7ff) << 16)
-#define DSIM_SUB_HRESOL_MASK		((0x7ff) << 0)
-
-/* EXYNOS_DSIM_INTSRC */
-#define INTSRC_PLL_STABLE		(1 << 31)
-#define INTSRC_SW_RST_RELEASE		(1 << 30)
-#define INTSRC_SFR_FIFO_EMPTY		(1 << 29)
-#define INTSRC_FRAME_DONE		(1 << 24)
-#define INTSRC_RX_DATA_DONE		(1 << 18)
-
-/* EXYNOS_DSIM_INTMSK */
-#define INTMSK_FIFO_EMPTY		(1 << 29)
-#define INTMSK_BTA			(1 << 25)
-#define INTMSK_FRAME_DONE		(1 << 24)
-#define INTMSK_RX_TIMEOUT		(1 << 21)
-#define INTMSK_BTA_TIMEOUT		(1 << 20)
-#define INTMSK_RX_DONE			(1 << 18)
-#define INTMSK_RX_TE			(1 << 17)
-#define INTMSK_RX_ACK			(1 << 16)
-#define INTMSK_RX_ECC_ERR		(1 << 15)
-#define INTMSK_RX_CRC_ERR		(1 << 14)
-
-/* EXYNOS_DSIM_FIFOCTRL */
-#define SFR_HEADER_EMPTY		(1 << 22)
-
-/* EXYNOS_DSIM_PHYACCHR */
-#define DSIM_AFC_CTL(x)			(((x) & 0x7) << 5)
-
-/* EXYNOS_DSIM_PLLCTRL */
-#define DSIM_PLL_EN_SHIFT(x)		((x) << 23)
-#define DSIM_FREQ_BAND_SHIFT(x)		((x) << 24)
-
-#endif /* _EXYNOS_MIPI_DSI_REGS_H */
diff --git a/drivers/video/fbdev/exynos/s6e8ax0.c b/drivers/video/fbdev/exynos/s6e8ax0.c
deleted file mode 100644
index de2f3e793786..000000000000
--- a/drivers/video/fbdev/exynos/s6e8ax0.c
+++ /dev/null
@@ -1,887 +0,0 @@
-/* linux/drivers/video/exynos/s6e8ax0.c
- *
- * MIPI-DSI based s6e8ax0 AMOLED lcd 4.65 inch panel driver.
- *
- * Inki Dae, <inki.dae@samsung.com>
- * Donghwa Lee, <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#include <linux/module.h>
-#include <linux/kernel.h>
-#include <linux/errno.h>
-#include <linux/mutex.h>
-#include <linux/wait.h>
-#include <linux/ctype.h>
-#include <linux/io.h>
-#include <linux/delay.h>
-#include <linux/irq.h>
-#include <linux/interrupt.h>
-#include <linux/lcd.h>
-#include <linux/fb.h>
-#include <linux/backlight.h>
-#include <linux/regulator/consumer.h>
-
-#include <video/mipi_display.h>
-#include <video/exynos_mipi_dsim.h>
-
-#define LDI_MTP_LENGTH		24
-#define DSIM_PM_STABLE_TIME	10
-#define MIN_BRIGHTNESS		0
-#define MAX_BRIGHTNESS		24
-#define GAMMA_TABLE_COUNT	26
-
-#define POWER_IS_ON(pwr)	((pwr) == FB_BLANK_UNBLANK)
-#define POWER_IS_OFF(pwr)	((pwr) == FB_BLANK_POWERDOWN)
-#define POWER_IS_NRM(pwr)	((pwr) == FB_BLANK_NORMAL)
-
-#define lcd_to_master(a)	(a->dsim_dev->master)
-#define lcd_to_master_ops(a)	((lcd_to_master(a))->master_ops)
-
-enum {
-	DSIM_NONE_STATE = 0,
-	DSIM_RESUME_COMPLETE = 1,
-	DSIM_FRAME_DONE = 2,
-};
-
-struct s6e8ax0 {
-	struct device	*dev;
-	unsigned int			power;
-	unsigned int			id;
-	unsigned int			gamma;
-	unsigned int			acl_enable;
-	unsigned int			cur_acl;
-
-	struct lcd_device	*ld;
-	struct backlight_device	*bd;
-
-	struct mipi_dsim_lcd_device	*dsim_dev;
-	struct lcd_platform_data	*ddi_pd;
-	struct mutex			lock;
-	bool  enabled;
-};
-
-
-static struct regulator_bulk_data supplies[] = {
-	{ .supply = "vdd3", },
-	{ .supply = "vci", },
-};
-
-static void s6e8ax0_regulator_enable(struct s6e8ax0 *lcd)
-{
-	int ret = 0;
-	struct lcd_platform_data *pd = NULL;
-
-	pd = lcd->ddi_pd;
-	mutex_lock(&lcd->lock);
-	if (!lcd->enabled) {
-		ret = regulator_bulk_enable(ARRAY_SIZE(supplies), supplies);
-		if (ret)
-			goto out;
-
-		lcd->enabled = true;
-	}
-	msleep(pd->power_on_delay);
-out:
-	mutex_unlock(&lcd->lock);
-}
-
-static void s6e8ax0_regulator_disable(struct s6e8ax0 *lcd)
-{
-	int ret = 0;
-
-	mutex_lock(&lcd->lock);
-	if (lcd->enabled) {
-		ret = regulator_bulk_disable(ARRAY_SIZE(supplies), supplies);
-		if (ret)
-			goto out;
-
-		lcd->enabled = false;
-	}
-out:
-	mutex_unlock(&lcd->lock);
-}
-
-static const unsigned char s6e8ax0_22_gamma_30[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xf5, 0x00, 0xff, 0xad, 0xaf,
-	0xbA, 0xc3, 0xd8, 0xc5, 0x9f, 0xc6, 0x9e, 0xc1, 0xdc, 0xc0,
-	0x00, 0x61, 0x00, 0x5a, 0x00, 0x74,
-};
-
-static const unsigned char s6e8ax0_22_gamma_50[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xe8, 0x1f, 0xf7, 0xad, 0xc0,
-	0xb5, 0xc4, 0xdc, 0xc4, 0x9e, 0xc6, 0x9c, 0xbb, 0xd8, 0xbb,
-	0x00, 0x70, 0x00, 0x68, 0x00, 0x86,
-};
-
-static const unsigned char s6e8ax0_22_gamma_60[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xde, 0x1f, 0xef, 0xad, 0xc4,
-	0xb3, 0xc3, 0xdd, 0xc4, 0x9e, 0xc6, 0x9c, 0xbc, 0xd6, 0xba,
-	0x00, 0x75, 0x00, 0x6e, 0x00, 0x8d,
-};
-
-static const unsigned char s6e8ax0_22_gamma_70[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xd8, 0x1f, 0xe7, 0xaf, 0xc8,
-	0xb4, 0xc4, 0xdd, 0xc3, 0x9d, 0xc6, 0x9c, 0xbb, 0xd6, 0xb9,
-	0x00, 0x7a, 0x00, 0x72, 0x00, 0x93,
-};
-
-static const unsigned char s6e8ax0_22_gamma_80[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xc9, 0x1f, 0xde, 0xae, 0xc9,
-	0xb1, 0xc3, 0xdd, 0xc2, 0x9d, 0xc5, 0x9b, 0xbc, 0xd6, 0xbb,
-	0x00, 0x7f, 0x00, 0x77, 0x00, 0x99,
-};
-
-static const unsigned char s6e8ax0_22_gamma_90[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xc7, 0x1f, 0xd9, 0xb0, 0xcc,
-	0xb2, 0xc3, 0xdc, 0xc1, 0x9c, 0xc6, 0x9c, 0xbc, 0xd4, 0xb9,
-	0x00, 0x83, 0x00, 0x7b, 0x00, 0x9e,
-};
-
-static const unsigned char s6e8ax0_22_gamma_100[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xbd, 0x80, 0xcd, 0xba, 0xce,
-	0xb3, 0xc4, 0xde, 0xc3, 0x9c, 0xc4, 0x9, 0xb8, 0xd3, 0xb6,
-	0x00, 0x88, 0x00, 0x80, 0x00, 0xa5,
-};
-
-static const unsigned char s6e8ax0_22_gamma_120[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb9, 0x95, 0xc8, 0xb1, 0xcf,
-	0xb2, 0xc6, 0xdf, 0xc5, 0x9b, 0xc3, 0x99, 0xb6, 0xd2, 0xb6,
-	0x00, 0x8f, 0x00, 0x86, 0x00, 0xac,
-};
-
-static const unsigned char s6e8ax0_22_gamma_130[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb7, 0xa0, 0xc7, 0xb1, 0xd0,
-	0xb2, 0xc4, 0xdd, 0xc3, 0x9a, 0xc3, 0x98, 0xb6, 0xd0, 0xb4,
-	0x00, 0x92, 0x00, 0x8a, 0x00, 0xb1,
-};
-
-static const unsigned char s6e8ax0_22_gamma_140[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb7, 0xa0, 0xc5, 0xb2, 0xd0,
-	0xb3, 0xc3, 0xde, 0xc3, 0x9b, 0xc2, 0x98, 0xb6, 0xd0, 0xb4,
-	0x00, 0x95, 0x00, 0x8d, 0x00, 0xb5,
-};
-
-static const unsigned char s6e8ax0_22_gamma_150[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb3, 0xa0, 0xc2, 0xb2, 0xd0,
-	0xb2, 0xc1, 0xdd, 0xc2, 0x9b, 0xc2, 0x98, 0xb4, 0xcf, 0xb1,
-	0x00, 0x99, 0x00, 0x90, 0x00, 0xba,
-};
-
-static const unsigned char s6e8ax0_22_gamma_160[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xa5, 0xbf, 0xb0, 0xd0,
-	0xb1, 0xc3, 0xde, 0xc2, 0x99, 0xc1, 0x97, 0xb4, 0xce, 0xb1,
-	0x00, 0x9c, 0x00, 0x93, 0x00, 0xbe,
-};
-
-static const unsigned char s6e8ax0_22_gamma_170[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb5, 0xbf, 0xb1, 0xd1,
-	0xb1, 0xc3, 0xde, 0xc3, 0x99, 0xc0, 0x96, 0xb4, 0xce, 0xb1,
-	0x00, 0x9f, 0x00, 0x96, 0x00, 0xc2,
-};
-
-static const unsigned char s6e8ax0_22_gamma_180[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb7, 0xbe, 0xb3, 0xd2,
-	0xb3, 0xc3, 0xde, 0xc2, 0x97, 0xbf, 0x95, 0xb4, 0xcd, 0xb1,
-	0x00, 0xa2, 0x00, 0x99, 0x00, 0xc5,
-};
-
-static const unsigned char s6e8ax0_22_gamma_190[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb9, 0xbe, 0xb2, 0xd2,
-	0xb2, 0xc3, 0xdd, 0xc3, 0x98, 0xbf, 0x95, 0xb2, 0xcc, 0xaf,
-	0x00, 0xa5, 0x00, 0x9c, 0x00, 0xc9,
-};
-
-static const unsigned char s6e8ax0_22_gamma_200[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xaf, 0xb9, 0xbc, 0xb2, 0xd2,
-	0xb1, 0xc4, 0xdd, 0xc3, 0x97, 0xbe, 0x95, 0xb1, 0xcb, 0xae,
-	0x00, 0xa8, 0x00, 0x9f, 0x00, 0xcd,
-};
-
-static const unsigned char s6e8ax0_22_gamma_210[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xc1, 0xbd, 0xb1, 0xd1,
-	0xb1, 0xc2, 0xde, 0xc2, 0x97, 0xbe, 0x94, 0xB0, 0xc9, 0xad,
-	0x00, 0xae, 0x00, 0xa4, 0x00, 0xd4,
-};
-
-static const unsigned char s6e8ax0_22_gamma_220[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xc7, 0xbd, 0xb1, 0xd1,
-	0xb1, 0xc2, 0xdd, 0xc2, 0x97, 0xbd, 0x94, 0xb0, 0xc9, 0xad,
-	0x00, 0xad, 0x00, 0xa2, 0x00, 0xd3,
-};
-
-static const unsigned char s6e8ax0_22_gamma_230[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xc3, 0xbd, 0xb2, 0xd1,
-	0xb1, 0xc3, 0xdd, 0xc1, 0x96, 0xbd, 0x94, 0xb0, 0xc9, 0xad,
-	0x00, 0xb0, 0x00, 0xa7, 0x00, 0xd7,
-};
-
-static const unsigned char s6e8ax0_22_gamma_240[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb1, 0xcb, 0xbd, 0xb1, 0xd2,
-	0xb1, 0xc3, 0xdD, 0xc2, 0x95, 0xbd, 0x93, 0xaf, 0xc8, 0xab,
-	0x00, 0xb3, 0x00, 0xa9, 0x00, 0xdb,
-};
-
-static const unsigned char s6e8ax0_22_gamma_250[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb3, 0xcc, 0xbe, 0xb0, 0xd2,
-	0xb0, 0xc3, 0xdD, 0xc2, 0x94, 0xbc, 0x92, 0xae, 0xc8, 0xab,
-	0x00, 0xb6, 0x00, 0xab, 0x00, 0xde,
-};
-
-static const unsigned char s6e8ax0_22_gamma_260[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb3, 0xd0, 0xbe, 0xaf, 0xd1,
-	0xaf, 0xc2, 0xdd, 0xc1, 0x96, 0xbc, 0x93, 0xaf, 0xc8, 0xac,
-	0x00, 0xb7, 0x00, 0xad, 0x00, 0xe0,
-};
-
-static const unsigned char s6e8ax0_22_gamma_270[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb2, 0xcF, 0xbd, 0xb0, 0xd2,
-	0xaf, 0xc2, 0xdc, 0xc1, 0x95, 0xbd, 0x93, 0xae, 0xc6, 0xaa,
-	0x00, 0xba, 0x00, 0xb0, 0x00, 0xe4,
-};
-
-static const unsigned char s6e8ax0_22_gamma_280[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb2, 0xd0, 0xbd, 0xaf, 0xd0,
-	0xad, 0xc4, 0xdd, 0xc3, 0x95, 0xbd, 0x93, 0xac, 0xc5, 0xa9,
-	0x00, 0xbd, 0x00, 0xb2, 0x00, 0xe7,
-};
-
-static const unsigned char s6e8ax0_22_gamma_300[] = {
-	0xfa, 0x01, 0x60, 0x10, 0x60, 0xb5, 0xd3, 0xbd, 0xb1, 0xd2,
-	0xb0, 0xc0, 0xdc, 0xc0, 0x94, 0xba, 0x91, 0xac, 0xc5, 0xa9,
-	0x00, 0xc2, 0x00, 0xb7, 0x00, 0xed,
-};
-
-static const unsigned char *s6e8ax0_22_gamma_table[] = {
-	s6e8ax0_22_gamma_30,
-	s6e8ax0_22_gamma_50,
-	s6e8ax0_22_gamma_60,
-	s6e8ax0_22_gamma_70,
-	s6e8ax0_22_gamma_80,
-	s6e8ax0_22_gamma_90,
-	s6e8ax0_22_gamma_100,
-	s6e8ax0_22_gamma_120,
-	s6e8ax0_22_gamma_130,
-	s6e8ax0_22_gamma_140,
-	s6e8ax0_22_gamma_150,
-	s6e8ax0_22_gamma_160,
-	s6e8ax0_22_gamma_170,
-	s6e8ax0_22_gamma_180,
-	s6e8ax0_22_gamma_190,
-	s6e8ax0_22_gamma_200,
-	s6e8ax0_22_gamma_210,
-	s6e8ax0_22_gamma_220,
-	s6e8ax0_22_gamma_230,
-	s6e8ax0_22_gamma_240,
-	s6e8ax0_22_gamma_250,
-	s6e8ax0_22_gamma_260,
-	s6e8ax0_22_gamma_270,
-	s6e8ax0_22_gamma_280,
-	s6e8ax0_22_gamma_300,
-};
-
-static void s6e8ax0_panel_cond(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-
-	static const unsigned char data_to_send[] = {
-		0xf8, 0x3d, 0x35, 0x00, 0x00, 0x00, 0x93, 0x00, 0x3c, 0x7d,
-		0x08, 0x27, 0x7d, 0x3f, 0x00, 0x00, 0x00, 0x20, 0x04, 0x08,
-		0x6e, 0x00, 0x00, 0x00, 0x02, 0x08, 0x08, 0x23, 0x23, 0xc0,
-		0xc8, 0x08, 0x48, 0xc1, 0x00, 0xc1, 0xff, 0xff, 0xc8
-	};
-	static const unsigned char data_to_send_panel_reverse[] = {
-		0xf8, 0x19, 0x35, 0x00, 0x00, 0x00, 0x93, 0x00, 0x3c, 0x7d,
-		0x08, 0x27, 0x7d, 0x3f, 0x00, 0x00, 0x00, 0x20, 0x04, 0x08,
-		0x6e, 0x00, 0x00, 0x00, 0x02, 0x08, 0x08, 0x23, 0x23, 0xc0,
-		0xc1, 0x01, 0x41, 0xc1, 0x00, 0xc1, 0xf6, 0xf6, 0xc1
-	};
-
-	if (lcd->dsim_dev->panel_reverse)
-		ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-				data_to_send_panel_reverse,
-				ARRAY_SIZE(data_to_send_panel_reverse));
-	else
-		ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-				data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_display_cond(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf2, 0x80, 0x03, 0x0d
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-/* Gamma 2.2 Setting (200cd, 7500K, 10MPCD) */
-static void s6e8ax0_gamma_cond(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	unsigned int gamma = lcd->bd->props.brightness;
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-			s6e8ax0_22_gamma_table[gamma],
-			GAMMA_TABLE_COUNT);
-}
-
-static void s6e8ax0_gamma_update(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf7, 0x03
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE_PARAM, data_to_send,
-		ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond1(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xd1, 0xfe, 0x80, 0x00, 0x01, 0x0b, 0x00, 0x00, 0x40,
-		0x0d, 0x00, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond2(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xb6, 0x0c, 0x02, 0x03, 0x32, 0xff, 0x44, 0x44, 0xc0,
-		0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond3(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe1, 0x10, 0x1c, 0x17, 0x08, 0x1d
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond4(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe2, 0xed, 0x07, 0xc3, 0x13, 0x0d, 0x03
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond5(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf4, 0xcf, 0x0a, 0x12, 0x10, 0x19, 0x33, 0x02
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-static void s6e8ax0_etc_cond6(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe3, 0x40
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE_PARAM,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_etc_cond7(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xe4, 0x00, 0x00, 0x14, 0x80, 0x00, 0x00, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_elvss_set(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xb1, 0x04, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_elvss_nvm_set(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xd9, 0x5c, 0x20, 0x0c, 0x0f, 0x41, 0x00, 0x10, 0x11,
-		0x12, 0xd1, 0x00, 0x00, 0x00, 0x00, 0x80, 0xcb, 0xed,
-		0x64, 0xaf
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_sleep_in(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x10, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_sleep_out(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x11, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_display_on(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x29, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_display_off(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0x28, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_apply_level2_key(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xf0, 0x5a, 0x5a
-	};
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_acl_on(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xc0, 0x01
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-static void s6e8ax0_acl_off(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	static const unsigned char data_to_send[] = {
-		0xc0, 0x00
-	};
-
-	ops->cmd_write(lcd_to_master(lcd),
-		MIPI_DSI_DCS_SHORT_WRITE,
-		data_to_send, ARRAY_SIZE(data_to_send));
-}
-
-/* Full white 50% reducing setting */
-static void s6e8ax0_acl_ctrl_set(struct s6e8ax0 *lcd)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	/* Full white 50% reducing setting */
-	static const unsigned char cutoff_50[] = {
-		0xc1, 0x47, 0x53, 0x13, 0x53, 0x00, 0x00, 0x02, 0xcf,
-		0x00, 0x00, 0x04, 0xff,	0x00, 0x00, 0x00, 0x00, 0x00,
-		0x01, 0x08, 0x0f, 0x16, 0x1d, 0x24, 0x2a, 0x31, 0x38,
-		0x3f, 0x46
-	};
-	/* Full white 45% reducing setting */
-	static const unsigned char cutoff_45[] = {
-		0xc1, 0x47, 0x53, 0x13, 0x53, 0x00, 0x00, 0x02, 0xcf,
-		0x00, 0x00, 0x04, 0xff,	0x00, 0x00, 0x00, 0x00, 0x00,
-		0x01, 0x07, 0x0d, 0x13, 0x19, 0x1f, 0x25, 0x2b, 0x31,
-		0x37, 0x3d
-	};
-	/* Full white 40% reducing setting */
-	static const unsigned char cutoff_40[] = {
-		0xc1, 0x47, 0x53, 0x13, 0x53, 0x00, 0x00, 0x02, 0xcf,
-		0x00, 0x00, 0x04, 0xff,	0x00, 0x00, 0x00, 0x00, 0x00,
-		0x01, 0x06, 0x0c, 0x11, 0x16, 0x1c, 0x21, 0x26, 0x2b,
-		0x31, 0x36
-	};
-
-	if (lcd->acl_enable) {
-		if (lcd->cur_acl == 0) {
-			if (lcd->gamma == 0 || lcd->gamma == 1) {
-				s6e8ax0_acl_off(lcd);
-				dev_dbg(&lcd->ld->dev,
-					"cur_acl=%d\n", lcd->cur_acl);
-			} else
-				s6e8ax0_acl_on(lcd);
-		}
-		switch (lcd->gamma) {
-		case 0: /* 30cd */
-			s6e8ax0_acl_off(lcd);
-			lcd->cur_acl = 0;
-			break;
-		case 1 ... 3: /* 50cd ~ 90cd */
-			ops->cmd_write(lcd_to_master(lcd),
-				MIPI_DSI_DCS_LONG_WRITE,
-				cutoff_40,
-				ARRAY_SIZE(cutoff_40));
-			lcd->cur_acl = 40;
-			break;
-		case 4 ... 7: /* 120cd ~ 210cd */
-			ops->cmd_write(lcd_to_master(lcd),
-				MIPI_DSI_DCS_LONG_WRITE,
-				cutoff_45,
-				ARRAY_SIZE(cutoff_45));
-			lcd->cur_acl = 45;
-			break;
-		case 8 ... 10: /* 220cd ~ 300cd */
-			ops->cmd_write(lcd_to_master(lcd),
-				MIPI_DSI_DCS_LONG_WRITE,
-				cutoff_50,
-				ARRAY_SIZE(cutoff_50));
-			lcd->cur_acl = 50;
-			break;
-		default:
-			break;
-		}
-	} else {
-		s6e8ax0_acl_off(lcd);
-		lcd->cur_acl = 0;
-		dev_dbg(&lcd->ld->dev, "cur_acl = %d\n", lcd->cur_acl);
-	}
-}
-
-static void s6e8ax0_read_id(struct s6e8ax0 *lcd, u8 *mtp_id)
-{
-	unsigned int ret;
-	unsigned int addr = 0xd1;	/* MTP ID */
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-
-	ret = ops->cmd_read(lcd_to_master(lcd),
-			MIPI_DSI_GENERIC_READ_REQUEST_1_PARAM,
-			addr, 3, mtp_id);
-}
-
-static int s6e8ax0_panel_init(struct s6e8ax0 *lcd)
-{
-	s6e8ax0_apply_level2_key(lcd);
-	s6e8ax0_sleep_out(lcd);
-	msleep(1);
-	s6e8ax0_panel_cond(lcd);
-	s6e8ax0_display_cond(lcd);
-	s6e8ax0_gamma_cond(lcd);
-	s6e8ax0_gamma_update(lcd);
-
-	s6e8ax0_etc_cond1(lcd);
-	s6e8ax0_etc_cond2(lcd);
-	s6e8ax0_etc_cond3(lcd);
-	s6e8ax0_etc_cond4(lcd);
-	s6e8ax0_etc_cond5(lcd);
-	s6e8ax0_etc_cond6(lcd);
-	s6e8ax0_etc_cond7(lcd);
-
-	s6e8ax0_elvss_nvm_set(lcd);
-	s6e8ax0_elvss_set(lcd);
-
-	s6e8ax0_acl_ctrl_set(lcd);
-	s6e8ax0_acl_on(lcd);
-
-	/* if ID3 value is not 33h, branch private elvss mode */
-	msleep(lcd->ddi_pd->power_on_delay);
-
-	return 0;
-}
-
-static int s6e8ax0_update_gamma_ctrl(struct s6e8ax0 *lcd, int brightness)
-{
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-
-	ops->cmd_write(lcd_to_master(lcd), MIPI_DSI_DCS_LONG_WRITE,
-			s6e8ax0_22_gamma_table[brightness],
-			ARRAY_SIZE(s6e8ax0_22_gamma_table));
-
-	/* update gamma table. */
-	s6e8ax0_gamma_update(lcd);
-	lcd->gamma = brightness;
-
-	return 0;
-}
-
-static int s6e8ax0_gamma_ctrl(struct s6e8ax0 *lcd, int gamma)
-{
-	s6e8ax0_update_gamma_ctrl(lcd, gamma);
-
-	return 0;
-}
-
-static int s6e8ax0_set_power(struct lcd_device *ld, int power)
-{
-	struct s6e8ax0 *lcd = lcd_get_data(ld);
-	struct mipi_dsim_master_ops *ops = lcd_to_master_ops(lcd);
-	int ret = 0;
-
-	if (power != FB_BLANK_UNBLANK && power != FB_BLANK_POWERDOWN &&
-			power != FB_BLANK_NORMAL) {
-		dev_err(lcd->dev, "power value should be 0, 1 or 4.\n");
-		return -EINVAL;
-	}
-
-	if ((power == FB_BLANK_UNBLANK) && ops->set_blank_mode) {
-		/* LCD power on */
-		if ((POWER_IS_ON(power) && POWER_IS_OFF(lcd->power))
-			|| (POWER_IS_ON(power) && POWER_IS_NRM(lcd->power))) {
-			ret = ops->set_blank_mode(lcd_to_master(lcd), power);
-			if (!ret && lcd->power != power)
-				lcd->power = power;
-		}
-	} else if ((power == FB_BLANK_POWERDOWN) && ops->set_early_blank_mode) {
-		/* LCD power off */
-		if ((POWER_IS_OFF(power) && POWER_IS_ON(lcd->power)) ||
-		(POWER_IS_ON(lcd->power) && POWER_IS_NRM(power))) {
-			ret = ops->set_early_blank_mode(lcd_to_master(lcd),
-							power);
-			if (!ret && lcd->power != power)
-				lcd->power = power;
-		}
-	}
-
-	return ret;
-}
-
-static int s6e8ax0_get_power(struct lcd_device *ld)
-{
-	struct s6e8ax0 *lcd = lcd_get_data(ld);
-
-	return lcd->power;
-}
-
-static int s6e8ax0_set_brightness(struct backlight_device *bd)
-{
-	int ret = 0, brightness = bd->props.brightness;
-	struct s6e8ax0 *lcd = bl_get_data(bd);
-
-	if (brightness < MIN_BRIGHTNESS ||
-		brightness > bd->props.max_brightness) {
-		dev_err(lcd->dev, "lcd brightness should be %d to %d.\n",
-			MIN_BRIGHTNESS, MAX_BRIGHTNESS);
-		return -EINVAL;
-	}
-
-	ret = s6e8ax0_gamma_ctrl(lcd, brightness);
-	if (ret) {
-		dev_err(&bd->dev, "lcd brightness setting failed.\n");
-		return -EIO;
-	}
-
-	return ret;
-}
-
-static struct lcd_ops s6e8ax0_lcd_ops = {
-	.set_power = s6e8ax0_set_power,
-	.get_power = s6e8ax0_get_power,
-};
-
-static const struct backlight_ops s6e8ax0_backlight_ops = {
-	.update_status = s6e8ax0_set_brightness,
-};
-
-static void s6e8ax0_power_on(struct mipi_dsim_lcd_device *dsim_dev, int power)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	msleep(lcd->ddi_pd->power_on_delay);
-
-	/* lcd power on */
-	if (power)
-		s6e8ax0_regulator_enable(lcd);
-	else
-		s6e8ax0_regulator_disable(lcd);
-
-	msleep(lcd->ddi_pd->reset_delay);
-
-	/* lcd reset */
-	if (lcd->ddi_pd->reset)
-		lcd->ddi_pd->reset(lcd->ld);
-	msleep(5);
-}
-
-static void s6e8ax0_set_sequence(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	s6e8ax0_panel_init(lcd);
-	s6e8ax0_display_on(lcd);
-
-	lcd->power = FB_BLANK_UNBLANK;
-}
-
-static int s6e8ax0_probe(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd;
-	int ret;
-	u8 mtp_id[3] = {0, };
-
-	lcd = devm_kzalloc(&dsim_dev->dev, sizeof(struct s6e8ax0), GFP_KERNEL);
-	if (!lcd) {
-		dev_err(&dsim_dev->dev, "failed to allocate s6e8ax0 structure.\n");
-		return -ENOMEM;
-	}
-
-	lcd->dsim_dev = dsim_dev;
-	lcd->ddi_pd = (struct lcd_platform_data *)dsim_dev->platform_data;
-	lcd->dev = &dsim_dev->dev;
-
-	mutex_init(&lcd->lock);
-
-	ret = devm_regulator_bulk_get(lcd->dev, ARRAY_SIZE(supplies), supplies);
-	if (ret) {
-		dev_err(lcd->dev, "Failed to get regulators: %d\n", ret);
-		return ret;
-	}
-
-	lcd->ld = devm_lcd_device_register(lcd->dev, "s6e8ax0", lcd->dev, lcd,
-			&s6e8ax0_lcd_ops);
-	if (IS_ERR(lcd->ld)) {
-		dev_err(lcd->dev, "failed to register lcd ops.\n");
-		return PTR_ERR(lcd->ld);
-	}
-
-	lcd->bd = devm_backlight_device_register(lcd->dev, "s6e8ax0-bl",
-				lcd->dev, lcd, &s6e8ax0_backlight_ops, NULL);
-	if (IS_ERR(lcd->bd)) {
-		dev_err(lcd->dev, "failed to register backlight ops.\n");
-		return PTR_ERR(lcd->bd);
-	}
-
-	lcd->bd->props.max_brightness = MAX_BRIGHTNESS;
-	lcd->bd->props.brightness = MAX_BRIGHTNESS;
-
-	s6e8ax0_read_id(lcd, mtp_id);
-	if (mtp_id[0] == 0x00)
-		dev_err(lcd->dev, "read id failed\n");
-
-	dev_info(lcd->dev, "Read ID : %x, %x, %x\n",
-			mtp_id[0], mtp_id[1], mtp_id[2]);
-
-	if (mtp_id[2] == 0x33)
-		dev_info(lcd->dev,
-			"ID-3 is 0xff does not support dynamic elvss\n");
-	else
-		dev_info(lcd->dev,
-			"ID-3 is 0x%x support dynamic elvss\n", mtp_id[2]);
-
-	lcd->acl_enable = 1;
-	lcd->cur_acl = 0;
-
-	dev_set_drvdata(&dsim_dev->dev, lcd);
-
-	dev_dbg(lcd->dev, "probed s6e8ax0 panel driver.\n");
-
-	return 0;
-}
-
-static int __maybe_unused s6e8ax0_suspend(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	s6e8ax0_sleep_in(lcd);
-	msleep(lcd->ddi_pd->power_off_delay);
-	s6e8ax0_display_off(lcd);
-
-	s6e8ax0_regulator_disable(lcd);
-
-	return 0;
-}
-
-static int __maybe_unused s6e8ax0_resume(struct mipi_dsim_lcd_device *dsim_dev)
-{
-	struct s6e8ax0 *lcd = dev_get_drvdata(&dsim_dev->dev);
-
-	s6e8ax0_sleep_out(lcd);
-	msleep(lcd->ddi_pd->power_on_delay);
-
-	s6e8ax0_regulator_enable(lcd);
-	s6e8ax0_set_sequence(dsim_dev);
-
-	return 0;
-}
-
-static struct mipi_dsim_lcd_driver s6e8ax0_dsim_ddi_driver = {
-	.name = "s6e8ax0",
-	.id = -1,
-
-	.power_on = s6e8ax0_power_on,
-	.set_sequence = s6e8ax0_set_sequence,
-	.probe = s6e8ax0_probe,
-	.suspend = IS_ENABLED(CONFIG_PM) ? s6e8ax0_suspend : NULL,
-	.resume = IS_ENABLED(CONFIG_PM) ? s6e8ax0_resume : NULL,
-};
-
-static int s6e8ax0_init(void)
-{
-	exynos_mipi_dsi_register_lcd_driver(&s6e8ax0_dsim_ddi_driver);
-
-	return 0;
-}
-
-static void s6e8ax0_exit(void)
-{
-	return;
-}
-
-module_init(s6e8ax0_init);
-module_exit(s6e8ax0_exit);
-
-MODULE_AUTHOR("Donghwa Lee <dh09.lee@samsung.com>");
-MODULE_AUTHOR("Inki Dae <inki.dae@samsung.com>");
-MODULE_DESCRIPTION("MIPI-DSI based s6e8ax0 AMOLED LCD Panel Driver");
-MODULE_LICENSE("GPL");
diff --git a/drivers/video/fbdev/hecubafb.c b/drivers/video/fbdev/hecubafb.c
index e4031ef39491..8577195cb533 100644
--- a/drivers/video/fbdev/hecubafb.c
+++ b/drivers/video/fbdev/hecubafb.c
@@ -47,7 +47,7 @@
 #define DPY_W 600
 #define DPY_H 800
 
-static struct fb_fix_screeninfo hecubafb_fix = {
+static const struct fb_fix_screeninfo hecubafb_fix = {
 	.id =		"hecubafb",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_MONO01,
@@ -58,7 +58,7 @@ static struct fb_fix_screeninfo hecubafb_fix = {
 	.accel =	FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo hecubafb_var = {
+static const struct fb_var_screeninfo hecubafb_var = {
 	.xres		= DPY_W,
 	.yres		= DPY_H,
 	.xres_virtual	= DPY_W,
diff --git a/drivers/video/fbdev/hgafb.c b/drivers/video/fbdev/hgafb.c
index 15d3ccff2965..463028543173 100644
--- a/drivers/video/fbdev/hgafb.c
+++ b/drivers/video/fbdev/hgafb.c
@@ -106,7 +106,7 @@ static DEFINE_SPINLOCK(hga_reg_lock);
 
 /* Framebuffer driver structures */
 
-static struct fb_var_screeninfo hga_default_var = {
+static const struct fb_var_screeninfo hga_default_var = {
 	.xres		= 720,
 	.yres 		= 348,
 	.xres_virtual 	= 720,
diff --git a/drivers/video/fbdev/i740fb.c b/drivers/video/fbdev/i740fb.c
index cf5ccd0f2252..7bc5f6056c77 100644
--- a/drivers/video/fbdev/i740fb.c
+++ b/drivers/video/fbdev/i740fb.c
@@ -82,7 +82,7 @@ struct i740fb_par {
 #define DACSPEED24_SD	128
 #define DACSPEED32	86
 
-static struct fb_fix_screeninfo i740fb_fix = {
+static const struct fb_fix_screeninfo i740fb_fix = {
 	.id =		"i740fb",
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_TRUECOLOR,
diff --git a/drivers/video/fbdev/i810/i810_main.c b/drivers/video/fbdev/i810/i810_main.c
index 025b882a4826..483ab2592d0c 100644
--- a/drivers/video/fbdev/i810/i810_main.c
+++ b/drivers/video/fbdev/i810/i810_main.c
@@ -1691,7 +1691,7 @@ static int i810_alloc_agp_mem(struct fb_info *info)
 	if (!(par->i810_gtt.i810_cursor_memory = 
 	      agp_allocate_memory(bridge, par->cursor_heap.size >> 12,
 				  AGP_PHYSICAL_MEMORY))) {
-		printk("i810fb_alloc_cursormem:  can't allocate" 
+		printk("i810fb_alloc_cursormem:  can't allocate "
 		       "cursor memory\n");
 		agp_backend_release(bridge);
 		return -ENOMEM;
diff --git a/drivers/video/fbdev/intelfb/intelfbdrv.c b/drivers/video/fbdev/intelfb/intelfbdrv.c
index bf207444ba0c..ff2a5d2023e1 100644
--- a/drivers/video/fbdev/intelfb/intelfbdrv.c
+++ b/drivers/video/fbdev/intelfb/intelfbdrv.c
@@ -1301,11 +1301,6 @@ static int intelfb_check_var(struct fb_var_screeninfo *var,
 		break;
 	}
 
-	if (v.xoffset < 0)
-		v.xoffset = 0;
-	if (v.yoffset < 0)
-		v.yoffset = 0;
-
 	if (v.xoffset > v.xres_virtual - v.xres)
 		v.xoffset = v.xres_virtual - v.xres;
 	if (v.yoffset > v.yres_virtual - v.yres)
diff --git a/drivers/video/fbdev/kyro/fbdev.c b/drivers/video/fbdev/kyro/fbdev.c
index 5bb01533271e..f77478fb3d14 100644
--- a/drivers/video/fbdev/kyro/fbdev.c
+++ b/drivers/video/fbdev/kyro/fbdev.c
@@ -44,7 +44,7 @@ static struct fb_fix_screeninfo kyro_fix = {
 	.accel		= FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo kyro_var = {
+static const struct fb_var_screeninfo kyro_var = {
 	/* 640x480, 16bpp @ 60 Hz */
 	.xres		= 640,
 	.yres		= 480,
diff --git a/drivers/video/fbdev/matrox/matroxfb_Ti3026.c b/drivers/video/fbdev/matrox/matroxfb_Ti3026.c
index 195ad7cac1ba..68fa037d8cbc 100644
--- a/drivers/video/fbdev/matrox/matroxfb_Ti3026.c
+++ b/drivers/video/fbdev/matrox/matroxfb_Ti3026.c
@@ -372,7 +372,7 @@ static int Ti3026_init(struct matrox_fb_info *minfo, struct my_timming *m)
 
 	DBG(__func__)
 
-	memcpy(hw->DACreg, MGADACbpp32, sizeof(hw->DACreg));
+	memcpy(hw->DACreg, MGADACbpp32, sizeof(MGADACbpp32));
 	switch (minfo->fbcon.var.bits_per_pixel) {
 		case 4:	hw->DACreg[POS3026_XLATCHCTRL] = TVP3026_XLATCHCTRL_16_1;	/* or _8_1, they are same */
 			hw->DACreg[POS3026_XTRUECOLORCTRL] = TVP3026_XTRUECOLORCTRL_PSEUDOCOLOR;
diff --git a/drivers/video/fbdev/matrox/matroxfb_g450.c b/drivers/video/fbdev/matrox/matroxfb_g450.c
index cff0546ea6fd..f108ae66fc83 100644
--- a/drivers/video/fbdev/matrox/matroxfb_g450.c
+++ b/drivers/video/fbdev/matrox/matroxfb_g450.c
@@ -433,7 +433,7 @@ static void cve2_init_TVdata(int norm, struct mavenregs* data, const struct outp
 		0x00,	/* 3E written multiple times */
 		0x00,	/* 3F not written */
 	} };
-	static struct mavenregs ntscregs = { {
+	static const struct mavenregs ntscregs = { {
 		0x21, 0xF0, 0x7C, 0x1F,	/* 00: chroma subcarrier */
 		0x00,
 		0x00,	/* test */
diff --git a/drivers/video/fbdev/mb862xx/mb862xx-i2c.c b/drivers/video/fbdev/mb862xx/mb862xx-i2c.c
index c87e17afb3e2..ba96c44f2761 100644
--- a/drivers/video/fbdev/mb862xx/mb862xx-i2c.c
+++ b/drivers/video/fbdev/mb862xx/mb862xx-i2c.c
@@ -157,17 +157,10 @@ static struct i2c_adapter mb862xx_i2c_adapter = {
 
 int mb862xx_i2c_init(struct mb862xxfb_par *par)
 {
-	int ret;
-
 	mb862xx_i2c_adapter.algo_data = par;
 	par->adap = &mb862xx_i2c_adapter;
 
-	ret = i2c_add_adapter(par->adap);
-	if (ret < 0) {
-		dev_err(par->dev, "failed to add %s\n",
-			mb862xx_i2c_adapter.name);
-	}
-	return ret;
+	return i2c_add_adapter(par->adap);
 }
 
 void mb862xx_i2c_exit(struct mb862xxfb_par *par)
diff --git a/drivers/video/fbdev/mx3fb.c b/drivers/video/fbdev/mx3fb.c
index f91b1db262b0..8778e01cebac 100644
--- a/drivers/video/fbdev/mx3fb.c
+++ b/drivers/video/fbdev/mx3fb.c
@@ -845,7 +845,7 @@ static int __set_par(struct fb_info *fbi, bool lock)
 		if (fbi->var.sync & FB_SYNC_SHARP_MODE)
 			mode = IPU_PANEL_SHARP_TFT;
 
-		dev_dbg(fbi->device, "pixclock = %ul Hz\n",
+		dev_dbg(fbi->device, "pixclock = %u Hz\n",
 			(u32) (PICOS2KHZ(fbi->var.pixclock) * 1000UL));
 
 		if (sdc_init_panel(mx3fb, mode,
diff --git a/drivers/video/fbdev/mxsfb.c b/drivers/video/fbdev/mxsfb.c
index 4e6608ceac09..7846f0e8bbbb 100644
--- a/drivers/video/fbdev/mxsfb.c
+++ b/drivers/video/fbdev/mxsfb.c
@@ -800,6 +800,7 @@ static int mxsfb_init_fbinfo(struct mxsfb_info *host,
 			struct fb_videomode *vmode)
 {
 	int ret;
+	struct device *dev = &host->pdev->dev;
 	struct fb_info *fb_info = &host->fb_info;
 	struct fb_var_screeninfo *var = &fb_info->var;
 	dma_addr_t fb_phys;
@@ -825,12 +826,10 @@ static int mxsfb_init_fbinfo(struct mxsfb_info *host,
 
 	/* Memory allocation for framebuffer */
 	fb_size = SZ_2M;
-	fb_virt = alloc_pages_exact(fb_size, GFP_DMA);
+	fb_virt = dma_alloc_wc(dev, PAGE_ALIGN(fb_size), &fb_phys, GFP_KERNEL);
 	if (!fb_virt)
 		return -ENOMEM;
 
-	fb_phys = virt_to_phys(fb_virt);
-
 	fb_info->fix.smem_start = fb_phys;
 	fb_info->screen_base = fb_virt;
 	fb_info->screen_size = fb_info->fix.smem_len = fb_size;
@@ -843,9 +842,11 @@ static int mxsfb_init_fbinfo(struct mxsfb_info *host,
 
 static void mxsfb_free_videomem(struct mxsfb_info *host)
 {
+	struct device *dev = &host->pdev->dev;
 	struct fb_info *fb_info = &host->fb_info;
 
-	free_pages_exact(fb_info->screen_base, fb_info->fix.smem_len);
+	dma_free_wc(dev, fb_info->screen_size, fb_info->screen_base,
+		    fb_info->fix.smem_start);
 }
 
 static const struct platform_device_id mxsfb_devtype[] = {
diff --git a/drivers/video/fbdev/offb.c b/drivers/video/fbdev/offb.c
index fb60a8f0cc94..906c6e75c260 100644
--- a/drivers/video/fbdev/offb.c
+++ b/drivers/video/fbdev/offb.c
@@ -625,6 +625,21 @@ static void __init offb_init_nodriver(struct device_node *dp, int no_real_node)
 	if (address == OF_BAD_ADDR && addr_prop)
 		address = (u64)addr_prop;
 	if (address != OF_BAD_ADDR) {
+#ifdef CONFIG_PCI
+		const __be32 *vidp, *didp;
+		u32 vid, did;
+		struct pci_dev *pdev;
+
+		vidp = of_get_property(dp, "vendor-id", NULL);
+		didp = of_get_property(dp, "device-id", NULL);
+		if (vidp && didp) {
+			vid = be32_to_cpup(vidp);
+			did = be32_to_cpup(didp);
+			pdev = pci_get_device(vid, did, NULL);
+			if (!pdev || pci_enable_device(pdev))
+				return;
+		}
+#endif
 		/* kludge for valkyrie */
 		if (strcmp(dp->name, "valkyrie") == 0)
 			address += 0x1000;
diff --git a/drivers/video/fbdev/omap/lcd_mipid.c b/drivers/video/fbdev/omap/lcd_mipid.c
index 0e4cee9a8d79..c81f150589e1 100644
--- a/drivers/video/fbdev/omap/lcd_mipid.c
+++ b/drivers/video/fbdev/omap/lcd_mipid.c
@@ -60,7 +60,6 @@ struct mipid_device {
 	struct mutex		mutex;
 	struct lcd_panel	panel;
 
-	struct workqueue_struct	*esd_wq;
 	struct delayed_work	esd_work;
 	void			(*esd_check)(struct mipid_device *m);
 };
@@ -390,7 +389,7 @@ static void ls041y3_esd_check(struct mipid_device *md)
 static void mipid_esd_start_check(struct mipid_device *md)
 {
 	if (md->esd_check != NULL)
-		queue_delayed_work(md->esd_wq, &md->esd_work,
+		schedule_delayed_work(&md->esd_work,
 				   MIPID_ESD_CHECK_PERIOD);
 }
 
@@ -476,11 +475,6 @@ static int mipid_init(struct lcd_panel *panel,
 	struct mipid_device *md = to_mipid_device(panel);
 
 	md->fbdev = fbdev;
-	md->esd_wq = create_singlethread_workqueue("mipid_esd");
-	if (md->esd_wq == NULL) {
-		dev_err(&md->spi->dev, "can't create ESD workqueue\n");
-		return -ENOMEM;
-	}
 	INIT_DELAYED_WORK(&md->esd_work, mipid_esd_work);
 	mutex_init(&md->mutex);
 
@@ -500,7 +494,6 @@ static void mipid_cleanup(struct lcd_panel *panel)
 
 	if (md->enabled)
 		mipid_esd_stop_check(md);
-	destroy_workqueue(md->esd_wq);
 }
 
 static struct lcd_panel mipid_panel = {
diff --git a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
index b58012b82b6f..8b810696a42b 100644
--- a/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
+++ b/drivers/video/fbdev/omap2/omapfb/displays/panel-dsi-cm.c
@@ -75,8 +75,6 @@ struct panel_drv_data {
 
 	bool intro_printed;
 
-	struct workqueue_struct *workqueue;
-
 	bool ulps_enabled;
 	unsigned ulps_timeout;
 	struct delayed_work ulps_work;
@@ -232,7 +230,7 @@ static int dsicm_set_update_window(struct panel_drv_data *ddata,
 static void dsicm_queue_ulps_work(struct panel_drv_data *ddata)
 {
 	if (ddata->ulps_timeout > 0)
-		queue_delayed_work(ddata->workqueue, &ddata->ulps_work,
+		schedule_delayed_work(&ddata->ulps_work,
 				msecs_to_jiffies(ddata->ulps_timeout));
 }
 
@@ -1244,11 +1242,6 @@ static int dsicm_probe(struct platform_device *pdev)
 		dev_dbg(dev, "Using GPIO TE\n");
 	}
 
-	ddata->workqueue = create_singlethread_workqueue("dsicm_wq");
-	if (ddata->workqueue == NULL) {
-		dev_err(dev, "can't create workqueue\n");
-		return -ENOMEM;
-	}
 	INIT_DELAYED_WORK(&ddata->ulps_work, dsicm_ulps_work);
 
 	dsicm_hw_reset(ddata);
@@ -1262,7 +1255,7 @@ static int dsicm_probe(struct platform_device *pdev)
 				dev, ddata, &dsicm_bl_ops, &props);
 		if (IS_ERR(bldev)) {
 			r = PTR_ERR(bldev);
-			goto err_bl;
+			goto err_reg;
 		}
 
 		ddata->bldev = bldev;
@@ -1285,8 +1278,6 @@ static int dsicm_probe(struct platform_device *pdev)
 err_sysfs_create:
 	if (bldev != NULL)
 		backlight_device_unregister(bldev);
-err_bl:
-	destroy_workqueue(ddata->workqueue);
 err_reg:
 	return r;
 }
@@ -1316,7 +1307,6 @@ static int __exit dsicm_remove(struct platform_device *pdev)
 	omap_dss_put_device(ddata->in);
 
 	dsicm_cancel_ulps_work(ddata);
-	destroy_workqueue(ddata->workqueue);
 
 	/* reset, to be sure that the panel is in a valid state */
 	dsicm_hw_reset(ddata);
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c b/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c
index 3691bde4ce0a..a864608c5df1 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dispc-compat.c
@@ -644,6 +644,7 @@ int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
 {
 
 	int r;
+	long time_left;
 	DECLARE_COMPLETION_ONSTACK(completion);
 
 	r = omap_dispc_register_isr(dispc_irq_wait_handler, &completion,
@@ -652,15 +653,15 @@ int omap_dispc_wait_for_irq_interruptible_timeout(u32 irqmask,
 	if (r)
 		return r;
 
-	timeout = wait_for_completion_interruptible_timeout(&completion,
+	time_left = wait_for_completion_interruptible_timeout(&completion,
 			timeout);
 
 	omap_dispc_unregister_isr(dispc_irq_wait_handler, &completion, irqmask);
 
-	if (timeout == 0)
+	if (time_left == 0)
 		return -ETIMEDOUT;
 
-	if (timeout == -ERESTARTSYS)
+	if (time_left == -ERESTARTSYS)
 		return -ERESTARTSYS;
 
 	return 0;
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
index 9e4800a4e3d1..30d49f3800b3 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/dsi.c
@@ -1167,7 +1167,6 @@ static int dsi_regulator_init(struct platform_device *dsidev)
 {
 	struct dsi_data *dsi = dsi_get_dsidrv_data(dsidev);
 	struct regulator *vdds_dsi;
-	int r;
 
 	if (dsi->vdds_dsi_reg != NULL)
 		return 0;
@@ -1180,13 +1179,6 @@ static int dsi_regulator_init(struct platform_device *dsidev)
 		return PTR_ERR(vdds_dsi);
 	}
 
-	r = regulator_set_voltage(vdds_dsi, 1800000, 1800000);
-	if (r) {
-		devm_regulator_put(vdds_dsi);
-		DSSERR("can't set the DSI regulator voltage\n");
-		return r;
-	}
-
 	dsi->vdds_dsi_reg = vdds_dsi;
 
 	return 0;
@@ -5348,7 +5340,7 @@ static int dsi_bind(struct device *dev, struct device *master, void *data)
 
 	dsi->phy_base = devm_ioremap(&dsidev->dev, res->start,
 		resource_size(res));
-	if (!dsi->proto_base) {
+	if (!dsi->phy_base) {
 		DSSERR("can't ioremap DSI PHY\n");
 		return -ENOMEM;
 	}
@@ -5368,7 +5360,7 @@ static int dsi_bind(struct device *dev, struct device *master, void *data)
 
 	dsi->pll_base = devm_ioremap(&dsidev->dev, res->start,
 		resource_size(res));
-	if (!dsi->proto_base) {
+	if (!dsi->pll_base) {
 		DSSERR("can't ioremap DSI PLL\n");
 		return -ENOMEM;
 	}
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
index 926a6f20dbb2..156a254705ea 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi4.c
@@ -100,7 +100,6 @@ static irqreturn_t hdmi_irq_handler(int irq, void *data)
 
 static int hdmi_init_regulator(void)
 {
-	int r;
 	struct regulator *reg;
 
 	if (hdmi.vdda_reg != NULL)
@@ -114,13 +113,6 @@ static int hdmi_init_regulator(void)
 		return PTR_ERR(reg);
 	}
 
-	r = regulator_set_voltage(reg, 1800000, 1800000);
-	if (r) {
-		devm_regulator_put(reg);
-		DSSWARN("can't set the regulator voltage\n");
-		return r;
-	}
-
 	hdmi.vdda_reg = reg;
 
 	return 0;
diff --git a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
index 0ee829a165c3..4da36bcab977 100644
--- a/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
+++ b/drivers/video/fbdev/omap2/omapfb/dss/hdmi5.c
@@ -119,7 +119,6 @@ static irqreturn_t hdmi_irq_handler(int irq, void *data)
 
 static int hdmi_init_regulator(void)
 {
-	int r;
 	struct regulator *reg;
 
 	if (hdmi.vdda_reg != NULL)
@@ -131,13 +130,6 @@ static int hdmi_init_regulator(void)
 		return PTR_ERR(reg);
 	}
 
-	r = regulator_set_voltage(reg, 1800000, 1800000);
-	if (r) {
-		devm_regulator_put(reg);
-		DSSWARN("can't set the regulator voltage\n");
-		return r;
-	}
-
 	hdmi.vdda_reg = reg;
 
 	return 0;
diff --git a/drivers/video/fbdev/pm2fb.c b/drivers/video/fbdev/pm2fb.c
index aa8d28880912..1a4070f719c2 100644
--- a/drivers/video/fbdev/pm2fb.c
+++ b/drivers/video/fbdev/pm2fb.c
@@ -113,7 +113,7 @@ static struct fb_fix_screeninfo pm2fb_fix = {
 /*
  * Default video mode. In case the modedb doesn't work.
  */
-static struct fb_var_screeninfo pm2fb_var = {
+static const struct fb_var_screeninfo pm2fb_var = {
 	/* "640x480, 8 bpp @ 60 Hz */
 	.xres =			640,
 	.yres =			480,
diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c
index 2c0487f4f805..ef73f14d7ba0 100644
--- a/drivers/video/fbdev/pxafb.c
+++ b/drivers/video/fbdev/pxafb.c
@@ -2125,7 +2125,7 @@ static int of_get_pxafb_display(struct device *dev, struct device_node *disp,
 
 	timings = of_get_display_timings(disp);
 	if (!timings)
-		goto out;
+		return -EINVAL;
 
 	ret = -ENOMEM;
 	info->modes = kmalloc_array(timings->num_timings,
@@ -2186,6 +2186,7 @@ static int of_get_pxafb_mode_info(struct device *dev,
 	ret = of_property_read_u32(np, "bus-width", &bus_width);
 	if (ret) {
 		dev_err(dev, "no bus-width specified: %d\n", ret);
+		of_node_put(np);
 		return ret;
 	}
 
diff --git a/drivers/video/fbdev/s1d13xxxfb.c b/drivers/video/fbdev/s1d13xxxfb.c
index 96aa46dc696c..5d6179ef0298 100644
--- a/drivers/video/fbdev/s1d13xxxfb.c
+++ b/drivers/video/fbdev/s1d13xxxfb.c
@@ -83,7 +83,7 @@ static const char *s1d13xxxfb_prod_names[] = {
 /*
  * here we define the default struct fb_fix_screeninfo
  */
-static struct fb_fix_screeninfo s1d13xxxfb_fix = {
+static const struct fb_fix_screeninfo s1d13xxxfb_fix = {
 	.id		= S1D_FBID,
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_PSEUDOCOLOR,
@@ -929,7 +929,7 @@ static int s1d13xxxfb_suspend(struct platform_device *dev, pm_message_t state)
 		s1dfb->disp_save = kmalloc(info->fix.smem_len, GFP_KERNEL);
 
 	if (!s1dfb->disp_save) {
-		printk(KERN_ERR PFX "no memory to save screen");
+		printk(KERN_ERR PFX "no memory to save screen\n");
 		return -ENOMEM;
 	}
 
diff --git a/drivers/video/fbdev/s3c2410fb.c b/drivers/video/fbdev/s3c2410fb.c
index 0dd86be36afb..a67e4567e656 100644
--- a/drivers/video/fbdev/s3c2410fb.c
+++ b/drivers/video/fbdev/s3c2410fb.c
@@ -767,7 +767,7 @@ static irqreturn_t s3c2410fb_irq(int irq, void *dev_id)
 	return IRQ_HANDLED;
 }
 
-#ifdef CONFIG_CPU_FREQ
+#ifdef CONFIG_ARM_S3C24XX_CPUFREQ
 
 static int s3c2410fb_cpufreq_transition(struct notifier_block *nb,
 					unsigned long val, void *data)
diff --git a/drivers/video/fbdev/s3c2410fb.h b/drivers/video/fbdev/s3c2410fb.h
index 47a17bd23011..cdd11e2f8859 100644
--- a/drivers/video/fbdev/s3c2410fb.h
+++ b/drivers/video/fbdev/s3c2410fb.h
@@ -32,7 +32,7 @@ struct s3c2410fb_info {
 	unsigned long		clk_rate;
 	unsigned int		palette_ready;
 
-#ifdef CONFIG_CPU_FREQ
+#ifdef CONFIG_ARM_S3C24XX_CPUFREQ
 	struct notifier_block	freq_transition;
 #endif
 
diff --git a/drivers/video/fbdev/savage/savagefb_driver.c b/drivers/video/fbdev/savage/savagefb_driver.c
index 6c77ab09b0b2..c30a91c1137c 100644
--- a/drivers/video/fbdev/savage/savagefb_driver.c
+++ b/drivers/video/fbdev/savage/savagefb_driver.c
@@ -1660,7 +1660,7 @@ static struct fb_ops savagefb_ops = {
 
 /* --------------------------------------------------------------------- */
 
-static struct fb_var_screeninfo savagefb_var800x600x8 = {
+static const struct fb_var_screeninfo savagefb_var800x600x8 = {
 	.accel_flags =	FB_ACCELF_TEXT,
 	.xres =		800,
 	.yres =		600,
diff --git a/drivers/video/fbdev/simplefb.c b/drivers/video/fbdev/simplefb.c
index e9cf19977285..61f799a515dc 100644
--- a/drivers/video/fbdev/simplefb.c
+++ b/drivers/video/fbdev/simplefb.c
@@ -33,14 +33,14 @@
 #include <linux/parser.h>
 #include <linux/regulator/consumer.h>
 
-static struct fb_fix_screeninfo simplefb_fix = {
+static const struct fb_fix_screeninfo simplefb_fix = {
 	.id		= "simple",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_TRUECOLOR,
 	.accel		= FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo simplefb_var = {
+static const struct fb_var_screeninfo simplefb_var = {
 	.height		= -1,
 	.width		= -1,
 	.activate	= FB_ACTIVATE_NOW,
@@ -74,8 +74,14 @@ static int simplefb_setcolreg(u_int regno, u_int red, u_int green, u_int blue,
 	return 0;
 }
 
+struct simplefb_par;
+static void simplefb_clocks_destroy(struct simplefb_par *par);
+static void simplefb_regulators_destroy(struct simplefb_par *par);
+
 static void simplefb_destroy(struct fb_info *info)
 {
+	simplefb_regulators_destroy(info->par);
+	simplefb_clocks_destroy(info->par);
 	if (info->screen_base)
 		iounmap(info->screen_base);
 }
@@ -487,11 +493,8 @@ error_fb_release:
 static int simplefb_remove(struct platform_device *pdev)
 {
 	struct fb_info *info = platform_get_drvdata(pdev);
-	struct simplefb_par *par = info->par;
 
 	unregister_framebuffer(info);
-	simplefb_regulators_destroy(par);
-	simplefb_clocks_destroy(par);
 	framebuffer_release(info);
 
 	return 0;
diff --git a/drivers/video/fbdev/sm712fb.c b/drivers/video/fbdev/sm712fb.c
index 86ae1d4556fc..73cb4ffff3c5 100644
--- a/drivers/video/fbdev/sm712fb.c
+++ b/drivers/video/fbdev/sm712fb.c
@@ -56,7 +56,7 @@ struct smtcfb_info {
 
 void __iomem *smtc_regbaseaddress;	/* Memory Map IO starting address */
 
-static struct fb_var_screeninfo smtcfb_var = {
+static const struct fb_var_screeninfo smtcfb_var = {
 	.xres           = 1024,
 	.yres           = 600,
 	.xres_virtual   = 1024,
diff --git a/drivers/video/fbdev/smscufx.c b/drivers/video/fbdev/smscufx.c
index 9279e5f6696e..ec2e7e353685 100644
--- a/drivers/video/fbdev/smscufx.c
+++ b/drivers/video/fbdev/smscufx.c
@@ -1761,10 +1761,8 @@ error:
 static void ufx_usb_disconnect(struct usb_interface *interface)
 {
 	struct ufx_data *dev;
-	struct fb_info *info;
 
 	dev = usb_get_intfdata(interface);
-	info = dev->info;
 
 	pr_debug("USB disconnect starting\n");
 
diff --git a/drivers/video/fbdev/ssd1307fb.c b/drivers/video/fbdev/ssd1307fb.c
index a9c45c89b15e..2925d5ce8d3e 100644
--- a/drivers/video/fbdev/ssd1307fb.c
+++ b/drivers/video/fbdev/ssd1307fb.c
@@ -64,7 +64,7 @@ struct ssd1307fb_par {
 	u32 contrast;
 	u32 dclk_div;
 	u32 dclk_frq;
-	struct ssd1307fb_deviceinfo *device_info;
+	const struct ssd1307fb_deviceinfo *device_info;
 	struct i2c_client *client;
 	u32 height;
 	struct fb_info *info;
@@ -84,7 +84,7 @@ struct ssd1307fb_array {
 	u8	data[0];
 };
 
-static struct fb_fix_screeninfo ssd1307fb_fix = {
+static const struct fb_fix_screeninfo ssd1307fb_fix = {
 	.id		= "Solomon SSD1307",
 	.type		= FB_TYPE_PACKED_PIXELS,
 	.visual		= FB_VISUAL_MONO10,
@@ -94,7 +94,7 @@ static struct fb_fix_screeninfo ssd1307fb_fix = {
 	.accel		= FB_ACCEL_NONE,
 };
 
-static struct fb_var_screeninfo ssd1307fb_var = {
+static const struct fb_var_screeninfo ssd1307fb_var = {
 	.bits_per_pixel	= 1,
 };
 
@@ -559,8 +559,7 @@ static int ssd1307fb_probe(struct i2c_client *client,
 	par->info = info;
 	par->client = client;
 
-	par->device_info = (struct ssd1307fb_deviceinfo *)of_match_device(
-			ssd1307fb_of_match, &client->dev)->data;
+	par->device_info = of_device_get_match_data(&client->dev);
 
 	par->reset = of_get_named_gpio(client->dev.of_node,
 					 "reset-gpios", 0);
diff --git a/drivers/video/fbdev/tdfxfb.c b/drivers/video/fbdev/tdfxfb.c
index 621fa441a6db..d5fa313806fe 100644
--- a/drivers/video/fbdev/tdfxfb.c
+++ b/drivers/video/fbdev/tdfxfb.c
@@ -82,7 +82,7 @@
 #define VOODOO3_MAX_PIXCLOCK 300000
 #define VOODOO5_MAX_PIXCLOCK 350000
 
-static struct fb_fix_screeninfo tdfx_fix = {
+static const struct fb_fix_screeninfo tdfx_fix = {
 	.type =		FB_TYPE_PACKED_PIXELS,
 	.visual =	FB_VISUAL_PSEUDOCOLOR,
 	.ypanstep =	1,
@@ -90,7 +90,7 @@ static struct fb_fix_screeninfo tdfx_fix = {
 	.accel =	FB_ACCEL_3DFX_BANSHEE
 };
 
-static struct fb_var_screeninfo tdfx_var = {
+static const struct fb_var_screeninfo tdfx_var = {
 	/* "640x480, 8 bpp @ 60 Hz */
 	.xres =		640,
 	.yres =		480,
diff --git a/drivers/video/fbdev/uvesafb.c b/drivers/video/fbdev/uvesafb.c
index 178ae93b7ebd..98af9e02959b 100644
--- a/drivers/video/fbdev/uvesafb.c
+++ b/drivers/video/fbdev/uvesafb.c
@@ -33,7 +33,7 @@ static struct cb_id uvesafb_cn_id = {
 static char v86d_path[PATH_MAX] = "/sbin/v86d";
 static char v86d_started;	/* has v86d been started by uvesafb? */
 
-static struct fb_fix_screeninfo uvesafb_fix = {
+static const struct fb_fix_screeninfo uvesafb_fix = {
 	.id	= "VESA VGA",
 	.type	= FB_TYPE_PACKED_PIXELS,
 	.accel	= FB_ACCEL_NONE,
diff --git a/drivers/video/fbdev/vfb.c b/drivers/video/fbdev/vfb.c
index b9c2f81fb6b9..da653a080394 100644
--- a/drivers/video/fbdev/vfb.c
+++ b/drivers/video/fbdev/vfb.c
@@ -35,76 +35,23 @@
 static void *videomemory;
 static u_long videomemorysize = VIDEOMEMSIZE;
 module_param(videomemorysize, ulong, 0);
+MODULE_PARM_DESC(videomemorysize, "RAM available to frame buffer (in bytes)");
 
-/**********************************************************************
- *
- * Memory management
- *
- **********************************************************************/
-static void *rvmalloc(unsigned long size)
-{
-	void *mem;
-	unsigned long adr;
-
-	size = PAGE_ALIGN(size);
-	mem = vmalloc_32(size);
-	if (!mem)
-		return NULL;
-
-	/*
-	 * VFB must clear memory to prevent kernel info
-	 * leakage into userspace
-	 * VGA-based drivers MUST NOT clear memory if
-	 * they want to be able to take over vgacon
-	 */
-
-	memset(mem, 0, size);
-	adr = (unsigned long) mem;
-	while (size > 0) {
-		SetPageReserved(vmalloc_to_page((void *)adr));
-		adr += PAGE_SIZE;
-		size -= PAGE_SIZE;
-	}
-
-	return mem;
-}
-
-static void rvfree(void *mem, unsigned long size)
-{
-	unsigned long adr;
-
-	if (!mem)
-		return;
-
-	adr = (unsigned long) mem;
-	while ((long) size > 0) {
-		ClearPageReserved(vmalloc_to_page((void *)adr));
-		adr += PAGE_SIZE;
-		size -= PAGE_SIZE;
-	}
-	vfree(mem);
-}
+static char *mode_option = NULL;
+module_param(mode_option, charp, 0);
+MODULE_PARM_DESC(mode_option, "Preferred video mode (e.g. 640x480-8@60)");
 
-static struct fb_var_screeninfo vfb_default = {
+static const struct fb_videomode vfb_default = {
 	.xres =		640,
 	.yres =		480,
-	.xres_virtual =	640,
-	.yres_virtual =	480,
-	.bits_per_pixel = 8,
-	.red =		{ 0, 8, 0 },
-      	.green =	{ 0, 8, 0 },
-      	.blue =		{ 0, 8, 0 },
-      	.activate =	FB_ACTIVATE_TEST,
-      	.height =	-1,
-      	.width =	-1,
-      	.pixclock =	20000,
-      	.left_margin =	64,
-      	.right_margin =	64,
-      	.upper_margin =	32,
-      	.lower_margin =	32,
-      	.hsync_len =	64,
-      	.vsync_len =	2,
-      	.vmode =	FB_VMODE_NONINTERLACED,
+	.pixclock =	20000,
+	.left_margin =	64,
+	.right_margin =	64,
+	.upper_margin =	32,
+	.lower_margin =	32,
+	.hsync_len =	64,
+	.vsync_len =	2,
+	.vmode =	FB_VMODE_NONINTERLACED,
 };
 
 static struct fb_fix_screeninfo vfb_fix = {
@@ -119,6 +66,7 @@ static struct fb_fix_screeninfo vfb_fix = {
 
 static bool vfb_enable __initdata = 0;	/* disabled by default */
 module_param(vfb_enable, bool, 0);
+MODULE_PARM_DESC(vfb_enable, "Enable Virtual FB driver");
 
 static int vfb_check_var(struct fb_var_screeninfo *var,
 			 struct fb_info *info);
@@ -421,35 +369,7 @@ static int vfb_pan_display(struct fb_var_screeninfo *var,
 static int vfb_mmap(struct fb_info *info,
 		    struct vm_area_struct *vma)
 {
-	unsigned long start = vma->vm_start;
-	unsigned long size = vma->vm_end - vma->vm_start;
-	unsigned long offset = vma->vm_pgoff << PAGE_SHIFT;
-	unsigned long page, pos;
-
-	if (vma->vm_pgoff > (~0UL >> PAGE_SHIFT))
-		return -EINVAL;
-	if (size > info->fix.smem_len)
-		return -EINVAL;
-	if (offset > info->fix.smem_len - size)
-		return -EINVAL;
-
-	pos = (unsigned long)info->fix.smem_start + offset;
-
-	while (size > 0) {
-		page = vmalloc_to_pfn((void *)pos);
-		if (remap_pfn_range(vma, start, page, PAGE_SIZE, PAGE_SHARED)) {
-			return -EAGAIN;
-		}
-		start += PAGE_SIZE;
-		pos += PAGE_SIZE;
-		if (size > PAGE_SIZE)
-			size -= PAGE_SIZE;
-		else
-			size = 0;
-	}
-
-	return 0;
-
+	return remap_vmalloc_range(vma, (void *)info->fix.smem_start, vma->vm_pgoff);
 }
 
 #ifndef MODULE
@@ -477,6 +397,8 @@ static int __init vfb_setup(char *options)
 		/* Test disable for backwards compatibility */
 		if (!strcmp(this_opt, "disable"))
 			vfb_enable = 0;
+		else
+			mode_option = this_opt;
 	}
 	return 1;
 }
@@ -489,12 +411,13 @@ static int __init vfb_setup(char *options)
 static int vfb_probe(struct platform_device *dev)
 {
 	struct fb_info *info;
+	unsigned int size = PAGE_ALIGN(videomemorysize);
 	int retval = -ENOMEM;
 
 	/*
 	 * For real video cards we use ioremap.
 	 */
-	if (!(videomemory = rvmalloc(videomemorysize)))
+	if (!(videomemory = vmalloc_32_user(size)))
 		return retval;
 
 	info = framebuffer_alloc(sizeof(u32) * 256, &dev->dev);
@@ -504,11 +427,13 @@ static int vfb_probe(struct platform_device *dev)
 	info->screen_base = (char __iomem *)videomemory;
 	info->fbops = &vfb_ops;
 
-	retval = fb_find_mode(&info->var, info, NULL,
-			      NULL, 0, NULL, 8);
+	if (!fb_find_mode(&info->var, info, mode_option,
+			  NULL, 0, &vfb_default, 8)){
+		fb_err(info, "Unable to find usable video mode.\n");
+		retval = -EINVAL;
+		goto err1;
+	}
 
-	if (!retval || (retval == 4))
-		info->var = vfb_default;
 	vfb_fix.smem_start = (unsigned long) videomemory;
 	vfb_fix.smem_len = videomemorysize;
 	info->fix = vfb_fix;
@@ -533,7 +458,7 @@ err2:
 err1:
 	framebuffer_release(info);
 err:
-	rvfree(videomemory, videomemorysize);
+	vfree(videomemory);
 	return retval;
 }
 
@@ -543,7 +468,7 @@ static int vfb_remove(struct platform_device *dev)
 
 	if (info) {
 		unregister_framebuffer(info);
-		rvfree(videomemory, videomemorysize);
+		vfree(videomemory);
 		fb_dealloc_cmap(&info->cmap);
 		framebuffer_release(info);
 	}
diff --git a/drivers/video/fbdev/vga16fb.c b/drivers/video/fbdev/vga16fb.c
index 283d335a759f..5f0690c8fc93 100644
--- a/drivers/video/fbdev/vga16fb.c
+++ b/drivers/video/fbdev/vga16fb.c
@@ -85,7 +85,7 @@ static struct fb_var_screeninfo vga16fb_defined = {
 };
 
 /* name should not depend on EGA/VGA */
-static struct fb_fix_screeninfo vga16fb_fix = {
+static const struct fb_fix_screeninfo vga16fb_fix = {
 	.id		= "VGA16 VGA",
 	.smem_start	= VGA_FB_PHYS,
 	.smem_len	= VGA_FB_PHYS_LEN,
diff --git a/drivers/watchdog/Kconfig b/drivers/watchdog/Kconfig
index 50dbaa805658..fdd3228e0678 100644
--- a/drivers/watchdog/Kconfig
+++ b/drivers/watchdog/Kconfig
@@ -1844,4 +1844,53 @@ config USBPCWATCHDOG
 
 	  Most people will say N.
 
+comment "Watchdog Pretimeout Governors"
+
+config WATCHDOG_PRETIMEOUT_GOV
+	bool "Enable watchdog pretimeout governors"
+	help
+	  The option allows to select watchdog pretimeout governors.
+
+if WATCHDOG_PRETIMEOUT_GOV
+
+choice
+	prompt "Default Watchdog Pretimeout Governor"
+	default WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC
+	help
+	  This option selects a default watchdog pretimeout governor.
+	  The governor takes its action, if a watchdog is capable
+	  to report a pretimeout event.
+
+config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP
+	bool "noop"
+	select WATCHDOG_PRETIMEOUT_GOV_NOOP
+	help
+	  Use noop watchdog pretimeout governor by default. If noop
+	  governor is selected by a user, write a short message to
+	  the kernel log buffer and don't do any system changes.
+
+config WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC
+	bool "panic"
+	select WATCHDOG_PRETIMEOUT_GOV_PANIC
+	help
+	  Use panic watchdog pretimeout governor by default, if
+	  a watchdog pretimeout event happens, consider that
+	  a watchdog feeder is dead and reboot is unavoidable.
+
+endchoice
+
+config WATCHDOG_PRETIMEOUT_GOV_NOOP
+	tristate "Noop watchdog pretimeout governor"
+	help
+	  Noop watchdog pretimeout governor, only an informational
+	  message is added to kernel log buffer.
+
+config WATCHDOG_PRETIMEOUT_GOV_PANIC
+	tristate "Panic watchdog pretimeout governor"
+	help
+	  Panic watchdog pretimeout governor, on watchdog pretimeout
+	  event put the kernel into panic.
+
+endif # WATCHDOG_PRETIMEOUT_GOV
+
 endif # WATCHDOG
diff --git a/drivers/watchdog/Makefile b/drivers/watchdog/Makefile
index cba00430151b..caa9f4aa492a 100644
--- a/drivers/watchdog/Makefile
+++ b/drivers/watchdog/Makefile
@@ -3,9 +3,15 @@
 #
 
 # The WatchDog Timer Driver Core.
-watchdog-objs	+= watchdog_core.o watchdog_dev.o
 obj-$(CONFIG_WATCHDOG_CORE)	+= watchdog.o
 
+watchdog-objs	+= watchdog_core.o watchdog_dev.o
+
+watchdog-$(CONFIG_WATCHDOG_PRETIMEOUT_GOV)	+= watchdog_pretimeout.o
+
+obj-$(CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP)	+= pretimeout_noop.o
+obj-$(CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC)	+= pretimeout_panic.o
+
 # Only one watchdog can succeed. We probe the ISA/PCI/USB based
 # watchdog-cards first, then the architecture specific watchdog
 # drivers and then the architecture independent "softdog" driver.
diff --git a/drivers/watchdog/asm9260_wdt.c b/drivers/watchdog/asm9260_wdt.c
index c9686b2fdafd..d0b59ba0f661 100644
--- a/drivers/watchdog/asm9260_wdt.c
+++ b/drivers/watchdog/asm9260_wdt.c
@@ -389,7 +389,6 @@ MODULE_DEVICE_TABLE(of, asm9260_wdt_of_match);
 static struct platform_driver asm9260_wdt_driver = {
 	.driver = {
 		.name = "asm9260-wdt",
-		.owner = THIS_MODULE,
 		.of_match_table	= asm9260_wdt_of_match,
 	},
 	.probe = asm9260_wdt_probe,
diff --git a/drivers/watchdog/ath79_wdt.c b/drivers/watchdog/ath79_wdt.c
index 835d310081e1..e2209bf5fa8a 100644
--- a/drivers/watchdog/ath79_wdt.c
+++ b/drivers/watchdog/ath79_wdt.c
@@ -35,6 +35,7 @@
 #include <linux/err.h>
 #include <linux/of.h>
 #include <linux/of_platform.h>
+#include <linux/uaccess.h>
 
 #define DRIVER_NAME	"ath79-wdt"
 
diff --git a/drivers/watchdog/bcm7038_wdt.c b/drivers/watchdog/bcm7038_wdt.c
index 4245b65d645c..e238df4d75a2 100644
--- a/drivers/watchdog/bcm7038_wdt.c
+++ b/drivers/watchdog/bcm7038_wdt.c
@@ -107,7 +107,7 @@ static struct watchdog_info bcm7038_wdt_info = {
 				WDIOF_MAGICCLOSE
 };
 
-static struct watchdog_ops bcm7038_wdt_ops = {
+static const struct watchdog_ops bcm7038_wdt_ops = {
 	.owner		= THIS_MODULE,
 	.start		= bcm7038_wdt_start,
 	.stop		= bcm7038_wdt_stop,
diff --git a/drivers/watchdog/cadence_wdt.c b/drivers/watchdog/cadence_wdt.c
index 4dda9024e229..98acef72334d 100644
--- a/drivers/watchdog/cadence_wdt.c
+++ b/drivers/watchdog/cadence_wdt.c
@@ -269,7 +269,7 @@ static struct watchdog_info cdns_wdt_info = {
 };
 
 /* Watchdog Core Ops */
-static struct watchdog_ops cdns_wdt_ops = {
+static const struct watchdog_ops cdns_wdt_ops = {
 	.owner = THIS_MODULE,
 	.start = cdns_wdt_start,
 	.stop = cdns_wdt_stop,
@@ -424,8 +424,10 @@ static int __maybe_unused cdns_wdt_suspend(struct device *dev)
 	struct platform_device *pdev = to_platform_device(dev);
 	struct cdns_wdt *wdt = platform_get_drvdata(pdev);
 
-	cdns_wdt_stop(&wdt->cdns_wdt_device);
-	clk_disable_unprepare(wdt->clk);
+	if (watchdog_active(&wdt->cdns_wdt_device)) {
+		cdns_wdt_stop(&wdt->cdns_wdt_device);
+		clk_disable_unprepare(wdt->clk);
+	}
 
 	return 0;
 }
@@ -442,12 +444,14 @@ static int __maybe_unused cdns_wdt_resume(struct device *dev)
 	struct platform_device *pdev = to_platform_device(dev);
 	struct cdns_wdt *wdt = platform_get_drvdata(pdev);
 
-	ret = clk_prepare_enable(wdt->clk);
-	if (ret) {
-		dev_err(dev, "unable to enable clock\n");
-		return ret;
+	if (watchdog_active(&wdt->cdns_wdt_device)) {
+		ret = clk_prepare_enable(wdt->clk);
+		if (ret) {
+			dev_err(dev, "unable to enable clock\n");
+			return ret;
+		}
+		cdns_wdt_start(&wdt->cdns_wdt_device);
 	}
-	cdns_wdt_start(&wdt->cdns_wdt_device);
 
 	return 0;
 }
diff --git a/drivers/watchdog/dw_wdt.c b/drivers/watchdog/dw_wdt.c
index 2acb51cf5504..3c6a3de13a1b 100644
--- a/drivers/watchdog/dw_wdt.c
+++ b/drivers/watchdog/dw_wdt.c
@@ -54,6 +54,7 @@ MODULE_PARM_DESC(nowayout, "Watchdog cannot be stopped once started "
 struct dw_wdt {
 	void __iomem		*regs;
 	struct clk		*clk;
+	unsigned long		rate;
 	struct notifier_block	restart_handler;
 	struct watchdog_device	wdd;
 };
@@ -72,7 +73,7 @@ static inline int dw_wdt_top_in_seconds(struct dw_wdt *dw_wdt, unsigned top)
 	 * There are 16 possible timeout values in 0..15 where the number of
 	 * cycles is 2 ^ (16 + i) and the watchdog counts down.
 	 */
-	return (1U << (16 + top)) / clk_get_rate(dw_wdt->clk);
+	return (1U << (16 + top)) / dw_wdt->rate;
 }
 
 static int dw_wdt_get_top(struct dw_wdt *dw_wdt)
@@ -163,7 +164,7 @@ static unsigned int dw_wdt_get_timeleft(struct watchdog_device *wdd)
 	struct dw_wdt *dw_wdt = to_dw_wdt(wdd);
 
 	return readl(dw_wdt->regs + WDOG_CURRENT_COUNT_REG_OFFSET) /
-		clk_get_rate(dw_wdt->clk);
+		dw_wdt->rate;
 }
 
 static const struct watchdog_info dw_wdt_ident = {
@@ -231,6 +232,12 @@ static int dw_wdt_drv_probe(struct platform_device *pdev)
 	if (ret)
 		return ret;
 
+	dw_wdt->rate = clk_get_rate(dw_wdt->clk);
+	if (dw_wdt->rate == 0) {
+		ret = -EINVAL;
+		goto out_disable_clk;
+	}
+
 	wdd = &dw_wdt->wdd;
 	wdd->info = &dw_wdt_ident;
 	wdd->ops = &dw_wdt_ops;
diff --git a/drivers/watchdog/hpwdt.c b/drivers/watchdog/hpwdt.c
index 8f89bd8a826a..70c7194e2810 100644
--- a/drivers/watchdog/hpwdt.c
+++ b/drivers/watchdog/hpwdt.c
@@ -39,7 +39,7 @@
 #include <asm/nmi.h>
 #include <asm/frame.h>
 
-#define HPWDT_VERSION			"1.3.3"
+#define HPWDT_VERSION			"1.4.0"
 #define SECS_TO_TICKS(secs)		((secs) * 1000 / 128)
 #define TICKS_TO_SECS(ticks)		((ticks) * 128 / 1000)
 #define HPWDT_MAX_TIMER			TICKS_TO_SECS(65535)
@@ -814,7 +814,8 @@ static int hpwdt_init_one(struct pci_dev *dev,
 	 * not run on a legacy ASM box.
 	 * So we only support the G5 ProLiant servers and higher.
 	 */
-	if (dev->subsystem_vendor != PCI_VENDOR_ID_HP) {
+	if (dev->subsystem_vendor != PCI_VENDOR_ID_HP &&
+	    dev->subsystem_vendor != PCI_VENDOR_ID_HP_3PAR) {
 		dev_warn(&dev->dev,
 			"This server does not have an iLO2+ ASIC.\n");
 		return -ENODEV;
@@ -823,7 +824,8 @@ static int hpwdt_init_one(struct pci_dev *dev,
 	/*
 	 * Ignore all auxilary iLO devices with the following PCI ID
 	 */
-	if (dev->subsystem_device == 0x1979)
+	if (dev->subsystem_vendor == PCI_VENDOR_ID_HP &&
+	    dev->subsystem_device == 0x1979)
 		return -ENODEV;
 
 	if (pci_enable_device(dev)) {
diff --git a/drivers/watchdog/iTCO_wdt.c b/drivers/watchdog/iTCO_wdt.c
index 54cab189a763..06fcb6c8c917 100644
--- a/drivers/watchdog/iTCO_wdt.c
+++ b/drivers/watchdog/iTCO_wdt.c
@@ -629,7 +629,7 @@ static int iTCO_wdt_resume_noirq(struct device *dev)
 	return 0;
 }
 
-static struct dev_pm_ops iTCO_wdt_pm = {
+static const struct dev_pm_ops iTCO_wdt_pm = {
 	.suspend_noirq = iTCO_wdt_suspend_noirq,
 	.resume_noirq = iTCO_wdt_resume_noirq,
 };
diff --git a/drivers/watchdog/imx2_wdt.c b/drivers/watchdog/imx2_wdt.c
index 62f346bb4348..4874b0f18650 100644
--- a/drivers/watchdog/imx2_wdt.c
+++ b/drivers/watchdog/imx2_wdt.c
@@ -24,6 +24,7 @@
 #include <linux/clk.h>
 #include <linux/delay.h>
 #include <linux/init.h>
+#include <linux/interrupt.h>
 #include <linux/io.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
@@ -37,18 +38,23 @@
 
 #define IMX2_WDT_WCR		0x00		/* Control Register */
 #define IMX2_WDT_WCR_WT		(0xFF << 8)	/* -> Watchdog Timeout Field */
-#define IMX2_WDT_WCR_WDA	(1 << 5)	/* -> External Reset WDOG_B */
-#define IMX2_WDT_WCR_SRS	(1 << 4)	/* -> Software Reset Signal */
-#define IMX2_WDT_WCR_WRE	(1 << 3)	/* -> WDOG Reset Enable */
-#define IMX2_WDT_WCR_WDE	(1 << 2)	/* -> Watchdog Enable */
-#define IMX2_WDT_WCR_WDZST	(1 << 0)	/* -> Watchdog timer Suspend */
+#define IMX2_WDT_WCR_WDA	BIT(5)		/* -> External Reset WDOG_B */
+#define IMX2_WDT_WCR_SRS	BIT(4)		/* -> Software Reset Signal */
+#define IMX2_WDT_WCR_WRE	BIT(3)		/* -> WDOG Reset Enable */
+#define IMX2_WDT_WCR_WDE	BIT(2)		/* -> Watchdog Enable */
+#define IMX2_WDT_WCR_WDZST	BIT(0)		/* -> Watchdog timer Suspend */
 
 #define IMX2_WDT_WSR		0x02		/* Service Register */
 #define IMX2_WDT_SEQ1		0x5555		/* -> service sequence 1 */
 #define IMX2_WDT_SEQ2		0xAAAA		/* -> service sequence 2 */
 
 #define IMX2_WDT_WRSR		0x04		/* Reset Status Register */
-#define IMX2_WDT_WRSR_TOUT	(1 << 1)	/* -> Reset due to Timeout */
+#define IMX2_WDT_WRSR_TOUT	BIT(1)		/* -> Reset due to Timeout */
+
+#define IMX2_WDT_WICR		0x06		/* Interrupt Control Register */
+#define IMX2_WDT_WICR_WIE	BIT(15)		/* -> Interrupt Enable */
+#define IMX2_WDT_WICR_WTIS	BIT(14)		/* -> Interrupt Status */
+#define IMX2_WDT_WICR_WICT	0xFF		/* -> Interrupt Count Timeout */
 
 #define IMX2_WDT_WMCR		0x08		/* Misc Register */
 
@@ -80,6 +86,12 @@ static const struct watchdog_info imx2_wdt_info = {
 	.options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE,
 };
 
+static const struct watchdog_info imx2_wdt_pretimeout_info = {
+	.identity = "imx2+ watchdog",
+	.options = WDIOF_KEEPALIVEPING | WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE |
+		   WDIOF_PRETIMEOUT,
+};
+
 static int imx2_wdt_restart(struct watchdog_device *wdog, unsigned long action,
 			    void *data)
 {
@@ -169,6 +181,35 @@ static int imx2_wdt_set_timeout(struct watchdog_device *wdog,
 	return 0;
 }
 
+static int imx2_wdt_set_pretimeout(struct watchdog_device *wdog,
+				   unsigned int new_pretimeout)
+{
+	struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
+
+	if (new_pretimeout >= IMX2_WDT_MAX_TIME)
+		return -EINVAL;
+
+	wdog->pretimeout = new_pretimeout;
+
+	regmap_update_bits(wdev->regmap, IMX2_WDT_WICR,
+			   IMX2_WDT_WICR_WIE | IMX2_WDT_WICR_WICT,
+			   IMX2_WDT_WICR_WIE | (new_pretimeout << 1));
+	return 0;
+}
+
+static irqreturn_t imx2_wdt_isr(int irq, void *wdog_arg)
+{
+	struct watchdog_device *wdog = wdog_arg;
+	struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
+
+	regmap_write_bits(wdev->regmap, IMX2_WDT_WICR,
+			  IMX2_WDT_WICR_WTIS, IMX2_WDT_WICR_WTIS);
+
+	watchdog_notify_pretimeout(wdog);
+
+	return IRQ_HANDLED;
+}
+
 static int imx2_wdt_start(struct watchdog_device *wdog)
 {
 	struct imx2_wdt_device *wdev = watchdog_get_drvdata(wdog);
@@ -188,6 +229,7 @@ static const struct watchdog_ops imx2_wdt_ops = {
 	.start = imx2_wdt_start,
 	.ping = imx2_wdt_ping,
 	.set_timeout = imx2_wdt_set_timeout,
+	.set_pretimeout = imx2_wdt_set_pretimeout,
 	.restart = imx2_wdt_restart,
 };
 
@@ -236,6 +278,12 @@ static int __init imx2_wdt_probe(struct platform_device *pdev)
 	wdog->max_hw_heartbeat_ms = IMX2_WDT_MAX_TIME * 1000;
 	wdog->parent		= &pdev->dev;
 
+	ret = platform_get_irq(pdev, 0);
+	if (ret > 0)
+		if (!devm_request_irq(&pdev->dev, ret, imx2_wdt_isr, 0,
+				      dev_name(&pdev->dev), wdog))
+			wdog->info = &imx2_wdt_pretimeout_info;
+
 	ret = clk_prepare_enable(wdev->clk);
 	if (ret)
 		return ret;
diff --git a/drivers/watchdog/kempld_wdt.c b/drivers/watchdog/kempld_wdt.c
index 5bf931ce1353..8e302d0e346c 100644
--- a/drivers/watchdog/kempld_wdt.c
+++ b/drivers/watchdog/kempld_wdt.c
@@ -430,7 +430,7 @@ static struct watchdog_info kempld_wdt_info = {
 			WDIOF_PRETIMEOUT
 };
 
-static struct watchdog_ops kempld_wdt_ops = {
+static const struct watchdog_ops kempld_wdt_ops = {
 	.owner		= THIS_MODULE,
 	.start		= kempld_wdt_start,
 	.stop		= kempld_wdt_stop,
diff --git a/drivers/watchdog/mt7621_wdt.c b/drivers/watchdog/mt7621_wdt.c
index 4a2290f900a8..d5735c12067d 100644
--- a/drivers/watchdog/mt7621_wdt.c
+++ b/drivers/watchdog/mt7621_wdt.c
@@ -139,7 +139,6 @@ static int mt7621_wdt_probe(struct platform_device *pdev)
 	if (!IS_ERR(mt7621_wdt_reset))
 		reset_control_deassert(mt7621_wdt_reset);
 
-	mt7621_wdt_dev.dev = &pdev->dev;
 	mt7621_wdt_dev.bootstatus = mt7621_wdt_bootcause();
 
 	watchdog_init_timeout(&mt7621_wdt_dev, mt7621_wdt_dev.max_timeout,
diff --git a/drivers/watchdog/of_xilinx_wdt.c b/drivers/watchdog/of_xilinx_wdt.c
index b2e1b4cbbdc1..fae7fe929ea3 100644
--- a/drivers/watchdog/of_xilinx_wdt.c
+++ b/drivers/watchdog/of_xilinx_wdt.c
@@ -10,6 +10,7 @@
  * 2 of the License, or (at your option) any later version.
  */
 
+#include <linux/clk.h>
 #include <linux/err.h>
 #include <linux/module.h>
 #include <linux/types.h>
@@ -45,6 +46,7 @@ struct xwdt_device {
 	u32 wdt_interval;
 	spinlock_t spinlock;
 	struct watchdog_device xilinx_wdt_wdd;
+	struct clk		*clk;
 };
 
 static int xilinx_wdt_start(struct watchdog_device *wdd)
@@ -195,16 +197,30 @@ static int xwdt_probe(struct platform_device *pdev)
 	spin_lock_init(&xdev->spinlock);
 	watchdog_set_drvdata(xilinx_wdt_wdd, xdev);
 
+	xdev->clk = devm_clk_get(&pdev->dev, NULL);
+	if (IS_ERR(xdev->clk)) {
+		if (PTR_ERR(xdev->clk) == -ENOENT)
+			xdev->clk = NULL;
+		else
+			return PTR_ERR(xdev->clk);
+	}
+
+	rc = clk_prepare_enable(xdev->clk);
+	if (rc) {
+		dev_err(&pdev->dev, "unable to enable clock\n");
+		return rc;
+	}
+
 	rc = xwdt_selftest(xdev);
 	if (rc == XWT_TIMER_FAILED) {
 		dev_err(&pdev->dev, "SelfTest routine error\n");
-		return rc;
+		goto err_clk_disable;
 	}
 
 	rc = watchdog_register_device(xilinx_wdt_wdd);
 	if (rc) {
 		dev_err(&pdev->dev, "Cannot register watchdog (err=%d)\n", rc);
-		return rc;
+		goto err_clk_disable;
 	}
 
 	dev_info(&pdev->dev, "Xilinx Watchdog Timer at %p with timeout %ds\n",
@@ -213,6 +229,10 @@ static int xwdt_probe(struct platform_device *pdev)
 	platform_set_drvdata(pdev, xdev);
 
 	return 0;
+err_clk_disable:
+	clk_disable_unprepare(xdev->clk);
+
+	return rc;
 }
 
 static int xwdt_remove(struct platform_device *pdev)
@@ -220,6 +240,7 @@ static int xwdt_remove(struct platform_device *pdev)
 	struct xwdt_device *xdev = platform_get_drvdata(pdev);
 
 	watchdog_unregister_device(&xdev->xilinx_wdt_wdd);
+	clk_disable_unprepare(xdev->clk);
 
 	return 0;
 }
diff --git a/drivers/watchdog/pretimeout_noop.c b/drivers/watchdog/pretimeout_noop.c
new file mode 100644
index 000000000000..85f5299d197c
--- /dev/null
+++ b/drivers/watchdog/pretimeout_noop.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2015-2016 Mentor Graphics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/module.h>
+#include <linux/printk.h>
+#include <linux/watchdog.h>
+
+#include "watchdog_pretimeout.h"
+
+/**
+ * pretimeout_noop - No operation on watchdog pretimeout event
+ * @wdd - watchdog_device
+ *
+ * This function prints a message about pretimeout to kernel log.
+ */
+static void pretimeout_noop(struct watchdog_device *wdd)
+{
+	pr_alert("watchdog%d: pretimeout event\n", wdd->id);
+}
+
+static struct watchdog_governor watchdog_gov_noop = {
+	.name		= "noop",
+	.pretimeout	= pretimeout_noop,
+};
+
+static int __init watchdog_gov_noop_register(void)
+{
+	return watchdog_register_governor(&watchdog_gov_noop);
+}
+
+static void __exit watchdog_gov_noop_unregister(void)
+{
+	watchdog_unregister_governor(&watchdog_gov_noop);
+}
+module_init(watchdog_gov_noop_register);
+module_exit(watchdog_gov_noop_unregister);
+
+MODULE_AUTHOR("Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>");
+MODULE_DESCRIPTION("Panic watchdog pretimeout governor");
+MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/pretimeout_panic.c b/drivers/watchdog/pretimeout_panic.c
new file mode 100644
index 000000000000..0c197a1c97f4
--- /dev/null
+++ b/drivers/watchdog/pretimeout_panic.c
@@ -0,0 +1,47 @@
+/*
+ * Copyright (C) 2015-2016 Mentor Graphics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/watchdog.h>
+
+#include "watchdog_pretimeout.h"
+
+/**
+ * pretimeout_panic - Panic on watchdog pretimeout event
+ * @wdd - watchdog_device
+ *
+ * Panic, watchdog has not been fed till pretimeout event.
+ */
+static void pretimeout_panic(struct watchdog_device *wdd)
+{
+	panic("watchdog pretimeout event\n");
+}
+
+static struct watchdog_governor watchdog_gov_panic = {
+	.name		= "panic",
+	.pretimeout	= pretimeout_panic,
+};
+
+static int __init watchdog_gov_panic_register(void)
+{
+	return watchdog_register_governor(&watchdog_gov_panic);
+}
+
+static void __exit watchdog_gov_panic_unregister(void)
+{
+	watchdog_unregister_governor(&watchdog_gov_panic);
+}
+module_init(watchdog_gov_panic_register);
+module_exit(watchdog_gov_panic_unregister);
+
+MODULE_AUTHOR("Vladimir Zapolskiy <vladimir_zapolskiy@mentor.com>");
+MODULE_DESCRIPTION("Panic watchdog pretimeout governor");
+MODULE_LICENSE("GPL");
diff --git a/drivers/watchdog/rn5t618_wdt.c b/drivers/watchdog/rn5t618_wdt.c
index d1c12278cb6a..0805ee2acd7a 100644
--- a/drivers/watchdog/rn5t618_wdt.c
+++ b/drivers/watchdog/rn5t618_wdt.c
@@ -136,7 +136,7 @@ static struct watchdog_info rn5t618_wdt_info = {
 	.identity	= DRIVER_NAME,
 };
 
-static struct watchdog_ops rn5t618_wdt_ops = {
+static const struct watchdog_ops rn5t618_wdt_ops = {
 	.owner          = THIS_MODULE,
 	.start          = rn5t618_wdt_start,
 	.stop           = rn5t618_wdt_stop,
diff --git a/drivers/watchdog/rt2880_wdt.c b/drivers/watchdog/rt2880_wdt.c
index 1967919ae743..14b4fd428fff 100644
--- a/drivers/watchdog/rt2880_wdt.c
+++ b/drivers/watchdog/rt2880_wdt.c
@@ -158,7 +158,6 @@ static int rt288x_wdt_probe(struct platform_device *pdev)
 
 	rt288x_wdt_freq = clk_get_rate(rt288x_wdt_clk) / RALINK_WDT_PRESCALE;
 
-	rt288x_wdt_dev.dev = &pdev->dev;
 	rt288x_wdt_dev.bootstatus = rt288x_wdt_bootcause();
 	rt288x_wdt_dev.max_timeout = (0xfffful / rt288x_wdt_freq);
 	rt288x_wdt_dev.parent = &pdev->dev;
diff --git a/drivers/watchdog/softdog.c b/drivers/watchdog/softdog.c
index b067edf246df..c7bdc986dca1 100644
--- a/drivers/watchdog/softdog.c
+++ b/drivers/watchdog/softdog.c
@@ -72,10 +72,27 @@ static void softdog_fire(unsigned long data)
 static struct timer_list softdog_ticktock =
 		TIMER_INITIALIZER(softdog_fire, 0, 0);
 
+static struct watchdog_device softdog_dev;
+
+static void softdog_pretimeout(unsigned long data)
+{
+	watchdog_notify_pretimeout(&softdog_dev);
+}
+
+static struct timer_list softdog_preticktock =
+		TIMER_INITIALIZER(softdog_pretimeout, 0, 0);
+
 static int softdog_ping(struct watchdog_device *w)
 {
 	if (!mod_timer(&softdog_ticktock, jiffies + (w->timeout * HZ)))
 		__module_get(THIS_MODULE);
+
+	if (w->pretimeout)
+		mod_timer(&softdog_preticktock, jiffies +
+			  (w->timeout - w->pretimeout) * HZ);
+	else
+		del_timer(&softdog_preticktock);
+
 	return 0;
 }
 
@@ -84,15 +101,18 @@ static int softdog_stop(struct watchdog_device *w)
 	if (del_timer(&softdog_ticktock))
 		module_put(THIS_MODULE);
 
+	del_timer(&softdog_preticktock);
+
 	return 0;
 }
 
 static struct watchdog_info softdog_info = {
 	.identity = "Software Watchdog",
-	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE,
+	.options = WDIOF_SETTIMEOUT | WDIOF_KEEPALIVEPING | WDIOF_MAGICCLOSE |
+		   WDIOF_PRETIMEOUT,
 };
 
-static struct watchdog_ops softdog_ops = {
+static const struct watchdog_ops softdog_ops = {
 	.owner = THIS_MODULE,
 	.start = softdog_ping,
 	.stop = softdog_stop,
diff --git a/drivers/watchdog/st_lpc_wdt.c b/drivers/watchdog/st_lpc_wdt.c
index 14e9badf2bfa..e6100e447dd8 100644
--- a/drivers/watchdog/st_lpc_wdt.c
+++ b/drivers/watchdog/st_lpc_wdt.c
@@ -52,27 +52,6 @@ struct st_wdog {
 	bool warm_reset;
 };
 
-static struct st_wdog_syscfg stid127_syscfg = {
-	.reset_type_reg		= 0x004,
-	.reset_type_mask	= BIT(2),
-	.enable_reg		= 0x000,
-	.enable_mask		= BIT(2),
-};
-
-static struct st_wdog_syscfg stih415_syscfg = {
-	.reset_type_reg		= 0x0B8,
-	.reset_type_mask	= BIT(6),
-	.enable_reg		= 0x0B4,
-	.enable_mask		= BIT(7),
-};
-
-static struct st_wdog_syscfg stih416_syscfg = {
-	.reset_type_reg		= 0x88C,
-	.reset_type_mask	= BIT(6),
-	.enable_reg		= 0x888,
-	.enable_mask		= BIT(7),
-};
-
 static struct st_wdog_syscfg stih407_syscfg = {
 	.enable_reg		= 0x204,
 	.enable_mask		= BIT(19),
@@ -83,18 +62,6 @@ static const struct of_device_id st_wdog_match[] = {
 		.compatible = "st,stih407-lpc",
 		.data = &stih407_syscfg,
 	},
-	{
-		.compatible = "st,stih416-lpc",
-		.data = &stih416_syscfg,
-	},
-	{
-		.compatible = "st,stih415-lpc",
-		.data = &stih415_syscfg,
-	},
-	{
-		.compatible = "st,stid127-lpc",
-		.data = &stid127_syscfg,
-	},
 	{},
 };
 MODULE_DEVICE_TABLE(of, st_wdog_match);
diff --git a/drivers/watchdog/tegra_wdt.c b/drivers/watchdog/tegra_wdt.c
index 9ec57608da82..2d53c3f9394f 100644
--- a/drivers/watchdog/tegra_wdt.c
+++ b/drivers/watchdog/tegra_wdt.c
@@ -178,7 +178,7 @@ static const struct watchdog_info tegra_wdt_info = {
 	.identity	= "Tegra Watchdog",
 };
 
-static struct watchdog_ops tegra_wdt_ops = {
+static const struct watchdog_ops tegra_wdt_ops = {
 	.owner = THIS_MODULE,
 	.start = tegra_wdt_start,
 	.stop = tegra_wdt_stop,
diff --git a/drivers/watchdog/txx9wdt.c b/drivers/watchdog/txx9wdt.c
index c2da880292bc..6f7a9deb27d0 100644
--- a/drivers/watchdog/txx9wdt.c
+++ b/drivers/watchdog/txx9wdt.c
@@ -112,7 +112,7 @@ static int __init txx9wdt_probe(struct platform_device *dev)
 		txx9_imclk = NULL;
 		goto exit;
 	}
-	ret = clk_enable(txx9_imclk);
+	ret = clk_prepare_enable(txx9_imclk);
 	if (ret) {
 		clk_put(txx9_imclk);
 		txx9_imclk = NULL;
@@ -144,7 +144,7 @@ static int __init txx9wdt_probe(struct platform_device *dev)
 	return 0;
 exit:
 	if (txx9_imclk) {
-		clk_disable(txx9_imclk);
+		clk_disable_unprepare(txx9_imclk);
 		clk_put(txx9_imclk);
 	}
 	return ret;
@@ -153,7 +153,7 @@ exit:
 static int __exit txx9wdt_remove(struct platform_device *dev)
 {
 	watchdog_unregister_device(&txx9wdt);
-	clk_disable(txx9_imclk);
+	clk_disable_unprepare(txx9_imclk);
 	clk_put(txx9_imclk);
 	return 0;
 }
diff --git a/drivers/watchdog/w83627hf_wdt.c b/drivers/watchdog/w83627hf_wdt.c
index 09e8003039dc..ef2ecaf53a14 100644
--- a/drivers/watchdog/w83627hf_wdt.c
+++ b/drivers/watchdog/w83627hf_wdt.c
@@ -302,7 +302,7 @@ static struct watchdog_info wdt_info = {
 	.identity = "W83627HF Watchdog",
 };
 
-static struct watchdog_ops wdt_ops = {
+static const struct watchdog_ops wdt_ops = {
 	.owner = THIS_MODULE,
 	.start = wdt_start,
 	.stop = wdt_stop,
diff --git a/drivers/watchdog/watchdog_core.c b/drivers/watchdog/watchdog_core.c
index 6abb83cd7681..74265b2f806c 100644
--- a/drivers/watchdog/watchdog_core.c
+++ b/drivers/watchdog/watchdog_core.c
@@ -349,7 +349,7 @@ int devm_watchdog_register_device(struct device *dev,
 	struct watchdog_device **rcwdd;
 	int ret;
 
-	rcwdd = devres_alloc(devm_watchdog_unregister_device, sizeof(*wdd),
+	rcwdd = devres_alloc(devm_watchdog_unregister_device, sizeof(*rcwdd),
 			     GFP_KERNEL);
 	if (!rcwdd)
 		return -ENOMEM;
diff --git a/drivers/watchdog/watchdog_dev.c b/drivers/watchdog/watchdog_dev.c
index 040bf8382f46..32930a073a12 100644
--- a/drivers/watchdog/watchdog_dev.c
+++ b/drivers/watchdog/watchdog_dev.c
@@ -49,6 +49,7 @@
 #include <linux/uaccess.h>	/* For copy_to_user/put_user/... */
 
 #include "watchdog_core.h"
+#include "watchdog_pretimeout.h"
 
 /*
  * struct watchdog_core_data - watchdog core internal data
@@ -335,10 +336,14 @@ static int watchdog_set_timeout(struct watchdog_device *wdd,
 	if (watchdog_timeout_invalid(wdd, timeout))
 		return -EINVAL;
 
-	if (wdd->ops->set_timeout)
+	if (wdd->ops->set_timeout) {
 		err = wdd->ops->set_timeout(wdd, timeout);
-	else
+	} else {
 		wdd->timeout = timeout;
+		/* Disable pretimeout if it doesn't fit the new timeout */
+		if (wdd->pretimeout >= wdd->timeout)
+			wdd->pretimeout = 0;
+	}
 
 	watchdog_update_worker(wdd);
 
@@ -346,6 +351,31 @@ static int watchdog_set_timeout(struct watchdog_device *wdd,
 }
 
 /*
+ *	watchdog_set_pretimeout: set the watchdog timer pretimeout
+ *	@wdd: the watchdog device to set the timeout for
+ *	@timeout: pretimeout to set in seconds
+ */
+
+static int watchdog_set_pretimeout(struct watchdog_device *wdd,
+				   unsigned int timeout)
+{
+	int err = 0;
+
+	if (!(wdd->info->options & WDIOF_PRETIMEOUT))
+		return -EOPNOTSUPP;
+
+	if (watchdog_pretimeout_invalid(wdd, timeout))
+		return -EINVAL;
+
+	if (wdd->ops->set_pretimeout)
+		err = wdd->ops->set_pretimeout(wdd, timeout);
+	else
+		wdd->pretimeout = timeout;
+
+	return err;
+}
+
+/*
  *	watchdog_get_timeleft: wrapper to get the time left before a reboot
  *	@wdd: the watchdog device to get the remaining time from
  *	@timeleft: the time that's left
@@ -429,6 +459,15 @@ static ssize_t timeout_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(timeout);
 
+static ssize_t pretimeout_show(struct device *dev,
+			       struct device_attribute *attr, char *buf)
+{
+	struct watchdog_device *wdd = dev_get_drvdata(dev);
+
+	return sprintf(buf, "%u\n", wdd->pretimeout);
+}
+static DEVICE_ATTR_RO(pretimeout);
+
 static ssize_t identity_show(struct device *dev, struct device_attribute *attr,
 				char *buf)
 {
@@ -450,6 +489,36 @@ static ssize_t state_show(struct device *dev, struct device_attribute *attr,
 }
 static DEVICE_ATTR_RO(state);
 
+static ssize_t pretimeout_available_governors_show(struct device *dev,
+				   struct device_attribute *attr, char *buf)
+{
+	return watchdog_pretimeout_available_governors_get(buf);
+}
+static DEVICE_ATTR_RO(pretimeout_available_governors);
+
+static ssize_t pretimeout_governor_show(struct device *dev,
+					struct device_attribute *attr,
+					char *buf)
+{
+	struct watchdog_device *wdd = dev_get_drvdata(dev);
+
+	return watchdog_pretimeout_governor_get(wdd, buf);
+}
+
+static ssize_t pretimeout_governor_store(struct device *dev,
+					 struct device_attribute *attr,
+					 const char *buf, size_t count)
+{
+	struct watchdog_device *wdd = dev_get_drvdata(dev);
+	int ret = watchdog_pretimeout_governor_set(wdd, buf);
+
+	if (!ret)
+		ret = count;
+
+	return ret;
+}
+static DEVICE_ATTR_RW(pretimeout_governor);
+
 static umode_t wdt_is_visible(struct kobject *kobj, struct attribute *attr,
 				int n)
 {
@@ -459,6 +528,14 @@ static umode_t wdt_is_visible(struct kobject *kobj, struct attribute *attr,
 
 	if (attr == &dev_attr_timeleft.attr && !wdd->ops->get_timeleft)
 		mode = 0;
+	else if (attr == &dev_attr_pretimeout.attr &&
+		 !(wdd->info->options & WDIOF_PRETIMEOUT))
+		mode = 0;
+	else if ((attr == &dev_attr_pretimeout_governor.attr ||
+		  attr == &dev_attr_pretimeout_available_governors.attr) &&
+		 (!(wdd->info->options & WDIOF_PRETIMEOUT) ||
+		  !IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_GOV)))
+		mode = 0;
 
 	return mode;
 }
@@ -466,10 +543,13 @@ static struct attribute *wdt_attrs[] = {
 	&dev_attr_state.attr,
 	&dev_attr_identity.attr,
 	&dev_attr_timeout.attr,
+	&dev_attr_pretimeout.attr,
 	&dev_attr_timeleft.attr,
 	&dev_attr_bootstatus.attr,
 	&dev_attr_status.attr,
 	&dev_attr_nowayout.attr,
+	&dev_attr_pretimeout_governor.attr,
+	&dev_attr_pretimeout_available_governors.attr,
 	NULL,
 };
 
@@ -646,6 +726,16 @@ static long watchdog_ioctl(struct file *file, unsigned int cmd,
 			break;
 		err = put_user(val, p);
 		break;
+	case WDIOC_SETPRETIMEOUT:
+		if (get_user(val, p)) {
+			err = -EFAULT;
+			break;
+		}
+		err = watchdog_set_pretimeout(wdd, val);
+		break;
+	case WDIOC_GETPRETIMEOUT:
+		err = put_user(wdd->pretimeout, p);
+		break;
 	default:
 		err = -ENOTTY;
 		break;
@@ -937,6 +1027,12 @@ int watchdog_dev_register(struct watchdog_device *wdd)
 		return PTR_ERR(dev);
 	}
 
+	ret = watchdog_register_pretimeout(wdd);
+	if (ret) {
+		device_destroy(&watchdog_class, devno);
+		watchdog_cdev_unregister(wdd);
+	}
+
 	return ret;
 }
 
@@ -950,6 +1046,7 @@ int watchdog_dev_register(struct watchdog_device *wdd)
 
 void watchdog_dev_unregister(struct watchdog_device *wdd)
 {
+	watchdog_unregister_pretimeout(wdd);
 	device_destroy(&watchdog_class, wdd->wd_data->cdev.dev);
 	watchdog_cdev_unregister(wdd);
 }
diff --git a/drivers/watchdog/watchdog_pretimeout.c b/drivers/watchdog/watchdog_pretimeout.c
new file mode 100644
index 000000000000..9db07bfb4334
--- /dev/null
+++ b/drivers/watchdog/watchdog_pretimeout.c
@@ -0,0 +1,220 @@
+/*
+ * Copyright (C) 2015-2016 Mentor Graphics
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ *
+ */
+
+#include <linux/list.h>
+#include <linux/slab.h>
+#include <linux/spinlock.h>
+#include <linux/string.h>
+#include <linux/watchdog.h>
+
+#include "watchdog_pretimeout.h"
+
+/* Default watchdog pretimeout governor */
+static struct watchdog_governor *default_gov;
+
+/* The spinlock protects default_gov, wdd->gov and pretimeout_list */
+static DEFINE_SPINLOCK(pretimeout_lock);
+
+/* List of watchdog devices, which can generate a pretimeout event */
+static LIST_HEAD(pretimeout_list);
+
+struct watchdog_pretimeout {
+	struct watchdog_device		*wdd;
+	struct list_head		entry;
+};
+
+/* The mutex protects governor list and serializes external interfaces */
+static DEFINE_MUTEX(governor_lock);
+
+/* List of the registered watchdog pretimeout governors */
+static LIST_HEAD(governor_list);
+
+struct governor_priv {
+	struct watchdog_governor	*gov;
+	struct list_head		entry;
+};
+
+static struct governor_priv *find_governor_by_name(const char *gov_name)
+{
+	struct governor_priv *priv;
+
+	list_for_each_entry(priv, &governor_list, entry)
+		if (sysfs_streq(gov_name, priv->gov->name))
+			return priv;
+
+	return NULL;
+}
+
+int watchdog_pretimeout_available_governors_get(char *buf)
+{
+	struct governor_priv *priv;
+	int count = 0;
+
+	mutex_lock(&governor_lock);
+
+	list_for_each_entry(priv, &governor_list, entry)
+		count += sprintf(buf + count, "%s\n", priv->gov->name);
+
+	mutex_unlock(&governor_lock);
+
+	return count;
+}
+
+int watchdog_pretimeout_governor_get(struct watchdog_device *wdd, char *buf)
+{
+	int count = 0;
+
+	spin_lock_irq(&pretimeout_lock);
+	if (wdd->gov)
+		count = sprintf(buf, "%s\n", wdd->gov->name);
+	spin_unlock_irq(&pretimeout_lock);
+
+	return count;
+}
+
+int watchdog_pretimeout_governor_set(struct watchdog_device *wdd,
+				     const char *buf)
+{
+	struct governor_priv *priv;
+
+	mutex_lock(&governor_lock);
+
+	priv = find_governor_by_name(buf);
+	if (!priv) {
+		mutex_unlock(&governor_lock);
+		return -EINVAL;
+	}
+
+	spin_lock_irq(&pretimeout_lock);
+	wdd->gov = priv->gov;
+	spin_unlock_irq(&pretimeout_lock);
+
+	mutex_unlock(&governor_lock);
+
+	return 0;
+}
+
+void watchdog_notify_pretimeout(struct watchdog_device *wdd)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&pretimeout_lock, flags);
+	if (!wdd->gov) {
+		spin_unlock_irqrestore(&pretimeout_lock, flags);
+		return;
+	}
+
+	wdd->gov->pretimeout(wdd);
+	spin_unlock_irqrestore(&pretimeout_lock, flags);
+}
+EXPORT_SYMBOL_GPL(watchdog_notify_pretimeout);
+
+int watchdog_register_governor(struct watchdog_governor *gov)
+{
+	struct watchdog_pretimeout *p;
+	struct governor_priv *priv;
+
+	priv = kzalloc(sizeof(*priv), GFP_KERNEL);
+	if (!priv)
+		return -ENOMEM;
+
+	mutex_lock(&governor_lock);
+
+	if (find_governor_by_name(gov->name)) {
+		mutex_unlock(&governor_lock);
+		kfree(priv);
+		return -EBUSY;
+	}
+
+	priv->gov = gov;
+	list_add(&priv->entry, &governor_list);
+
+	if (!strncmp(gov->name, WATCHDOG_PRETIMEOUT_DEFAULT_GOV,
+		     WATCHDOG_GOV_NAME_MAXLEN)) {
+		spin_lock_irq(&pretimeout_lock);
+		default_gov = gov;
+
+		list_for_each_entry(p, &pretimeout_list, entry)
+			if (!p->wdd->gov)
+				p->wdd->gov = default_gov;
+		spin_unlock_irq(&pretimeout_lock);
+	}
+
+	mutex_unlock(&governor_lock);
+
+	return 0;
+}
+EXPORT_SYMBOL(watchdog_register_governor);
+
+void watchdog_unregister_governor(struct watchdog_governor *gov)
+{
+	struct watchdog_pretimeout *p;
+	struct governor_priv *priv, *t;
+
+	mutex_lock(&governor_lock);
+
+	list_for_each_entry_safe(priv, t, &governor_list, entry) {
+		if (priv->gov == gov) {
+			list_del(&priv->entry);
+			kfree(priv);
+			break;
+		}
+	}
+
+	spin_lock_irq(&pretimeout_lock);
+	list_for_each_entry(p, &pretimeout_list, entry)
+		if (p->wdd->gov == gov)
+			p->wdd->gov = default_gov;
+	spin_unlock_irq(&pretimeout_lock);
+
+	mutex_unlock(&governor_lock);
+}
+EXPORT_SYMBOL(watchdog_unregister_governor);
+
+int watchdog_register_pretimeout(struct watchdog_device *wdd)
+{
+	struct watchdog_pretimeout *p;
+
+	if (!(wdd->info->options & WDIOF_PRETIMEOUT))
+		return 0;
+
+	p = kzalloc(sizeof(*p), GFP_KERNEL);
+	if (!p)
+		return -ENOMEM;
+
+	spin_lock_irq(&pretimeout_lock);
+	list_add(&p->entry, &pretimeout_list);
+	p->wdd = wdd;
+	wdd->gov = default_gov;
+	spin_unlock_irq(&pretimeout_lock);
+
+	return 0;
+}
+
+void watchdog_unregister_pretimeout(struct watchdog_device *wdd)
+{
+	struct watchdog_pretimeout *p, *t;
+
+	if (!(wdd->info->options & WDIOF_PRETIMEOUT))
+		return;
+
+	spin_lock_irq(&pretimeout_lock);
+	wdd->gov = NULL;
+
+	list_for_each_entry_safe(p, t, &pretimeout_list, entry) {
+		if (p->wdd == wdd) {
+			list_del(&p->entry);
+			break;
+		}
+	}
+	spin_unlock_irq(&pretimeout_lock);
+
+	kfree(p);
+}
diff --git a/drivers/watchdog/watchdog_pretimeout.h b/drivers/watchdog/watchdog_pretimeout.h
new file mode 100644
index 000000000000..a5a32b39c56d
--- /dev/null
+++ b/drivers/watchdog/watchdog_pretimeout.h
@@ -0,0 +1,60 @@
+#ifndef __WATCHDOG_PRETIMEOUT_H
+#define __WATCHDOG_PRETIMEOUT_H
+
+#define WATCHDOG_GOV_NAME_MAXLEN	20
+
+struct watchdog_device;
+
+struct watchdog_governor {
+	const char	name[WATCHDOG_GOV_NAME_MAXLEN];
+	void		(*pretimeout)(struct watchdog_device *wdd);
+};
+
+#if IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_GOV)
+/* Interfaces to watchdog pretimeout governors */
+int watchdog_register_governor(struct watchdog_governor *gov);
+void watchdog_unregister_governor(struct watchdog_governor *gov);
+
+/* Interfaces to watchdog_dev.c */
+int watchdog_register_pretimeout(struct watchdog_device *wdd);
+void watchdog_unregister_pretimeout(struct watchdog_device *wdd);
+int watchdog_pretimeout_available_governors_get(char *buf);
+int watchdog_pretimeout_governor_get(struct watchdog_device *wdd, char *buf);
+int watchdog_pretimeout_governor_set(struct watchdog_device *wdd,
+				     const char *buf);
+
+#if IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP)
+#define WATCHDOG_PRETIMEOUT_DEFAULT_GOV		"noop"
+#elif IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC)
+#define WATCHDOG_PRETIMEOUT_DEFAULT_GOV		"panic"
+#endif
+
+#else
+static inline int watchdog_register_pretimeout(struct watchdog_device *wdd)
+{
+	return 0;
+}
+
+static inline void watchdog_unregister_pretimeout(struct watchdog_device *wdd)
+{
+}
+
+static inline int watchdog_pretimeout_available_governors_get(char *buf)
+{
+	return -EINVAL;
+}
+
+static inline int watchdog_pretimeout_governor_get(struct watchdog_device *wdd,
+						   char *buf)
+{
+	return -EINVAL;
+}
+
+static inline int watchdog_pretimeout_governor_set(struct watchdog_device *wdd,
+						   const char *buf)
+{
+	return -EINVAL;
+}
+#endif
+
+#endif
diff --git a/drivers/watchdog/ziirave_wdt.c b/drivers/watchdog/ziirave_wdt.c
index fa1efef3c96e..b4e0cea5a64e 100644
--- a/drivers/watchdog/ziirave_wdt.c
+++ b/drivers/watchdog/ziirave_wdt.c
@@ -18,7 +18,10 @@
  * GNU General Public License for more details.
  */
 
+#include <linux/delay.h>
 #include <linux/i2c.h>
+#include <linux/ihex.h>
+#include <linux/firmware.h>
 #include <linux/kernel.h>
 #include <linux/module.h>
 #include <linux/slab.h>
@@ -36,6 +39,8 @@
 #define ZIIRAVE_STATE_OFF	0x1
 #define ZIIRAVE_STATE_ON	0x2
 
+#define ZIIRAVE_FW_NAME		"ziirave_wdt.fw"
+
 static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL,
 				  "host request", NULL, "illegal configuration",
 				  "illegal instruction", "illegal trap",
@@ -50,12 +55,35 @@ static char *ziirave_reasons[] = {"power cycle", "hw watchdog", NULL, NULL,
 #define ZIIRAVE_WDT_PING		0x9
 #define ZIIRAVE_WDT_RESET_DURATION	0xa
 
+#define ZIIRAVE_FIRM_PKT_TOTAL_SIZE	20
+#define ZIIRAVE_FIRM_PKT_DATA_SIZE	16
+#define ZIIRAVE_FIRM_FLASH_MEMORY_START	0x1600
+#define ZIIRAVE_FIRM_FLASH_MEMORY_END	0x2bbf
+
+/* Received and ready for next Download packet. */
+#define ZIIRAVE_FIRM_DOWNLOAD_ACK	1
+/* Currently writing to flash. Retry Download status in a moment! */
+#define ZIIRAVE_FIRM_DOWNLOAD_BUSY	2
+
+/* Wait for ACK timeout in ms */
+#define ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT	50
+
+/* Firmware commands */
+#define ZIIRAVE_CMD_DOWNLOAD_START		0x10
+#define ZIIRAVE_CMD_DOWNLOAD_END		0x11
+#define ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR	0x12
+#define ZIIRAVE_CMD_DOWNLOAD_READ_BYTE		0x13
+#define ZIIRAVE_CMD_RESET_PROCESSOR		0x0b
+#define ZIIRAVE_CMD_JUMP_TO_BOOTLOADER		0x0c
+#define ZIIRAVE_CMD_DOWNLOAD_PACKET		0x0e
+
 struct ziirave_wdt_rev {
 	unsigned char major;
 	unsigned char minor;
 };
 
 struct ziirave_wdt_data {
+	struct mutex sysfs_mutex;
 	struct watchdog_device wdd;
 	struct ziirave_wdt_rev bootloader_rev;
 	struct ziirave_wdt_rev firmware_rev;
@@ -146,6 +174,293 @@ static unsigned int ziirave_wdt_get_timeleft(struct watchdog_device *wdd)
 	return ret;
 }
 
+static int ziirave_firm_wait_for_ack(struct watchdog_device *wdd)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	int ret;
+	unsigned long timeout;
+
+	timeout = jiffies + msecs_to_jiffies(ZIIRAVE_FIRM_WAIT_FOR_ACK_TIMEOUT);
+	do {
+		if (time_after(jiffies, timeout))
+			return -ETIMEDOUT;
+
+		usleep_range(5000, 10000);
+
+		ret = i2c_smbus_read_byte(client);
+		if (ret < 0) {
+			dev_err(&client->dev, "Failed to read byte\n");
+			return ret;
+		}
+	} while (ret == ZIIRAVE_FIRM_DOWNLOAD_BUSY);
+
+	return ret == ZIIRAVE_FIRM_DOWNLOAD_ACK ? 0 : -EIO;
+}
+
+static int ziirave_firm_set_read_addr(struct watchdog_device *wdd, u16 addr)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	u8 address[2];
+
+	address[0] = addr & 0xff;
+	address[1] = (addr >> 8) & 0xff;
+
+	return i2c_smbus_write_block_data(client,
+					  ZIIRAVE_CMD_DOWNLOAD_SET_READ_ADDR,
+					  ARRAY_SIZE(address), address);
+}
+
+static int ziirave_firm_write_block_data(struct watchdog_device *wdd,
+					 u8 command, u8 length, const u8 *data,
+					 bool wait_for_ack)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	int ret;
+
+	ret = i2c_smbus_write_block_data(client, command, length, data);
+	if (ret) {
+		dev_err(&client->dev,
+			"Failed to send command 0x%02x: %d\n", command, ret);
+		return ret;
+	}
+
+	if (wait_for_ack)
+		ret = ziirave_firm_wait_for_ack(wdd);
+
+	return ret;
+}
+
+static int ziirave_firm_write_byte(struct watchdog_device *wdd, u8 command,
+				   u8 byte, bool wait_for_ack)
+{
+	return ziirave_firm_write_block_data(wdd, command, 1, &byte,
+					     wait_for_ack);
+}
+
+/*
+ * ziirave_firm_write_pkt() - Build and write a firmware packet
+ *
+ * A packet to send to the firmware is composed by following bytes:
+ *     Length | Addr0 | Addr1 | Data0 .. Data15 | Checksum |
+ * Where,
+ *     Length: A data byte containing the length of the data.
+ *     Addr0: Low byte of the address.
+ *     Addr1: High byte of the address.
+ *     Data0 .. Data15: Array of 16 bytes of data.
+ *     Checksum: Checksum byte to verify data integrity.
+ */
+static int ziirave_firm_write_pkt(struct watchdog_device *wdd,
+				  const struct ihex_binrec *rec)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	u8 i, checksum = 0, packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE];
+	int ret;
+	u16 addr;
+
+	memset(packet, 0, ARRAY_SIZE(packet));
+
+	/* Packet length */
+	packet[0] = (u8)be16_to_cpu(rec->len);
+	/* Packet address */
+	addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1;
+	packet[1] = addr & 0xff;
+	packet[2] = (addr & 0xff00) >> 8;
+
+	/* Packet data */
+	if (be16_to_cpu(rec->len) > ZIIRAVE_FIRM_PKT_DATA_SIZE)
+		return -EMSGSIZE;
+	memcpy(packet + 3, rec->data, be16_to_cpu(rec->len));
+
+	/* Packet checksum */
+	for (i = 0; i < ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1; i++)
+		checksum += packet[i];
+	packet[ZIIRAVE_FIRM_PKT_TOTAL_SIZE - 1] = checksum;
+
+	ret = ziirave_firm_write_block_data(wdd, ZIIRAVE_CMD_DOWNLOAD_PACKET,
+					    ARRAY_SIZE(packet), packet, true);
+	if (ret)
+		dev_err(&client->dev,
+		      "Failed to write firmware packet at address 0x%04x: %d\n",
+		      addr, ret);
+
+	return ret;
+}
+
+static int ziirave_firm_verify(struct watchdog_device *wdd,
+			       const struct firmware *fw)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	const struct ihex_binrec *rec;
+	int i, ret;
+	u8 data[ZIIRAVE_FIRM_PKT_DATA_SIZE];
+	u16 addr;
+
+	for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) {
+		/* Zero length marks end of records */
+		if (!be16_to_cpu(rec->len))
+			break;
+
+		addr = (be32_to_cpu(rec->addr) & 0xffff) >> 1;
+		if (addr < ZIIRAVE_FIRM_FLASH_MEMORY_START ||
+		    addr > ZIIRAVE_FIRM_FLASH_MEMORY_END)
+			continue;
+
+		ret = ziirave_firm_set_read_addr(wdd, addr);
+		if (ret) {
+			dev_err(&client->dev,
+				"Failed to send SET_READ_ADDR command: %d\n",
+				ret);
+			return ret;
+		}
+
+		for (i = 0; i < ARRAY_SIZE(data); i++) {
+			ret = i2c_smbus_read_byte_data(client,
+						ZIIRAVE_CMD_DOWNLOAD_READ_BYTE);
+			if (ret < 0) {
+				dev_err(&client->dev,
+					"Failed to READ DATA: %d\n", ret);
+				return ret;
+			}
+			data[i] = ret;
+		}
+
+		if (memcmp(data, rec->data, be16_to_cpu(rec->len))) {
+			dev_err(&client->dev,
+				"Firmware mismatch at address 0x%04x\n", addr);
+			return -EINVAL;
+		}
+	}
+
+	return 0;
+}
+
+static int ziirave_firm_upload(struct watchdog_device *wdd,
+			       const struct firmware *fw)
+{
+	struct i2c_client *client = to_i2c_client(wdd->parent);
+	int ret, words_till_page_break;
+	const struct ihex_binrec *rec;
+	struct ihex_binrec *rec_new;
+
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_JUMP_TO_BOOTLOADER, 1,
+				      false);
+	if (ret)
+		return ret;
+
+	msleep(500);
+
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_START, 1, true);
+	if (ret)
+		return ret;
+
+	msleep(500);
+
+	for (rec = (void *)fw->data; rec; rec = ihex_next_binrec(rec)) {
+		/* Zero length marks end of records */
+		if (!be16_to_cpu(rec->len))
+			break;
+
+		/* Check max data size */
+		if (be16_to_cpu(rec->len) > ZIIRAVE_FIRM_PKT_DATA_SIZE) {
+			dev_err(&client->dev, "Firmware packet too long (%d)\n",
+				be16_to_cpu(rec->len));
+			return -EMSGSIZE;
+		}
+
+		/* Calculate words till page break */
+		words_till_page_break = (64 - ((be32_to_cpu(rec->addr) >> 1) &
+					 0x3f));
+		if ((be16_to_cpu(rec->len) >> 1) > words_till_page_break) {
+			/*
+			 * Data in passes page boundary, so we need to split in
+			 * two blocks of data. Create a packet with the first
+			 * block of data.
+			 */
+			rec_new = kzalloc(sizeof(struct ihex_binrec) +
+					  (words_till_page_break << 1),
+					  GFP_KERNEL);
+			if (!rec_new)
+				return -ENOMEM;
+
+			rec_new->len = cpu_to_be16(words_till_page_break << 1);
+			rec_new->addr = rec->addr;
+			memcpy(rec_new->data, rec->data,
+			       be16_to_cpu(rec_new->len));
+
+			ret = ziirave_firm_write_pkt(wdd, rec_new);
+			kfree(rec_new);
+			if (ret)
+				return ret;
+
+			/* Create a packet with the second block of data */
+			rec_new = kzalloc(sizeof(struct ihex_binrec) +
+					  be16_to_cpu(rec->len) -
+					  (words_till_page_break << 1),
+					  GFP_KERNEL);
+			if (!rec_new)
+				return -ENOMEM;
+
+			/* Remaining bytes */
+			rec_new->len = rec->len -
+				       cpu_to_be16(words_till_page_break << 1);
+
+			rec_new->addr = cpu_to_be32(be32_to_cpu(rec->addr) +
+					(words_till_page_break << 1));
+
+			memcpy(rec_new->data,
+			       rec->data + (words_till_page_break << 1),
+			       be16_to_cpu(rec_new->len));
+
+			ret = ziirave_firm_write_pkt(wdd, rec_new);
+			kfree(rec_new);
+			if (ret)
+				return ret;
+		} else {
+			ret = ziirave_firm_write_pkt(wdd, rec);
+			if (ret)
+				return ret;
+		}
+	}
+
+	/* For end of download, the length field will be set to 0 */
+	rec_new = kzalloc(sizeof(struct ihex_binrec) + 1, GFP_KERNEL);
+	if (!rec_new)
+		return -ENOMEM;
+
+	ret = ziirave_firm_write_pkt(wdd, rec_new);
+	kfree(rec_new);
+	if (ret) {
+		dev_err(&client->dev, "Failed to send EMPTY packet: %d\n", ret);
+		return ret;
+	}
+
+	/* This sleep seems to be required */
+	msleep(20);
+
+	/* Start firmware verification */
+	ret = ziirave_firm_verify(wdd, fw);
+	if (ret) {
+		dev_err(&client->dev,
+			"Failed to verify firmware: %d\n", ret);
+		return ret;
+	}
+
+	/* End download operation */
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_DOWNLOAD_END, 1, false);
+	if (ret)
+		return ret;
+
+	/* Reset the processor */
+	ret = ziirave_firm_write_byte(wdd, ZIIRAVE_CMD_RESET_PROCESSOR, 1,
+				      false);
+	if (ret)
+		return ret;
+
+	msleep(500);
+
+	return 0;
+}
+
 static const struct watchdog_info ziirave_wdt_info = {
 	.options = WDIOF_SETTIMEOUT | WDIOF_MAGICCLOSE | WDIOF_KEEPALIVEPING,
 	.identity = "Zodiac RAVE Watchdog",
@@ -166,9 +481,18 @@ static ssize_t ziirave_wdt_sysfs_show_firm(struct device *dev,
 {
 	struct i2c_client *client = to_i2c_client(dev->parent);
 	struct ziirave_wdt_data *w_priv = i2c_get_clientdata(client);
+	int ret;
+
+	ret = mutex_lock_interruptible(&w_priv->sysfs_mutex);
+	if (ret)
+		return ret;
+
+	ret = sprintf(buf, "02.%02u.%02u", w_priv->firmware_rev.major,
+		      w_priv->firmware_rev.minor);
 
-	return sprintf(buf, "02.%02u.%02u", w_priv->firmware_rev.major,
-		       w_priv->firmware_rev.minor);
+	mutex_unlock(&w_priv->sysfs_mutex);
+
+	return ret;
 }
 
 static DEVICE_ATTR(firmware_version, S_IRUGO, ziirave_wdt_sysfs_show_firm,
@@ -180,9 +504,18 @@ static ssize_t ziirave_wdt_sysfs_show_boot(struct device *dev,
 {
 	struct i2c_client *client = to_i2c_client(dev->parent);
 	struct ziirave_wdt_data *w_priv = i2c_get_clientdata(client);
+	int ret;
 
-	return sprintf(buf, "01.%02u.%02u", w_priv->bootloader_rev.major,
-		       w_priv->bootloader_rev.minor);
+	ret = mutex_lock_interruptible(&w_priv->sysfs_mutex);
+	if (ret)
+		return ret;
+
+	ret = sprintf(buf, "01.%02u.%02u", w_priv->bootloader_rev.major,
+		      w_priv->bootloader_rev.minor);
+
+	mutex_unlock(&w_priv->sysfs_mutex);
+
+	return ret;
 }
 
 static DEVICE_ATTR(bootloader_version, S_IRUGO, ziirave_wdt_sysfs_show_boot,
@@ -194,17 +527,81 @@ static ssize_t ziirave_wdt_sysfs_show_reason(struct device *dev,
 {
 	struct i2c_client *client = to_i2c_client(dev->parent);
 	struct ziirave_wdt_data *w_priv = i2c_get_clientdata(client);
+	int ret;
+
+	ret = mutex_lock_interruptible(&w_priv->sysfs_mutex);
+	if (ret)
+		return ret;
+
+	ret = sprintf(buf, "%s", ziirave_reasons[w_priv->reset_reason]);
 
-	return sprintf(buf, "%s", ziirave_reasons[w_priv->reset_reason]);
+	mutex_unlock(&w_priv->sysfs_mutex);
+
+	return ret;
 }
 
 static DEVICE_ATTR(reset_reason, S_IRUGO, ziirave_wdt_sysfs_show_reason,
 		   NULL);
 
+static ssize_t ziirave_wdt_sysfs_store_firm(struct device *dev,
+					    struct device_attribute *attr,
+					    const char *buf, size_t count)
+{
+	struct i2c_client *client = to_i2c_client(dev->parent);
+	struct ziirave_wdt_data *w_priv = i2c_get_clientdata(client);
+	const struct firmware *fw;
+	int err;
+
+	err = request_ihex_firmware(&fw, ZIIRAVE_FW_NAME, dev);
+	if (err) {
+		dev_err(&client->dev, "Failed to request ihex firmware\n");
+		return err;
+	}
+
+	err = mutex_lock_interruptible(&w_priv->sysfs_mutex);
+	if (err)
+		goto release_firmware;
+
+	err = ziirave_firm_upload(&w_priv->wdd, fw);
+	if (err) {
+		dev_err(&client->dev, "The firmware update failed: %d\n", err);
+		goto unlock_mutex;
+	}
+
+	/* Update firmware version */
+	err = ziirave_wdt_revision(client, &w_priv->firmware_rev,
+				   ZIIRAVE_WDT_FIRM_VER_MAJOR);
+	if (err) {
+		dev_err(&client->dev, "Failed to read firmware version: %d\n",
+			err);
+		goto unlock_mutex;
+	}
+
+	dev_info(&client->dev, "Firmware updated to version 02.%02u.%02u\n",
+		 w_priv->firmware_rev.major, w_priv->firmware_rev.minor);
+
+	/* Restore the watchdog timeout */
+	err = ziirave_wdt_set_timeout(&w_priv->wdd, w_priv->wdd.timeout);
+	if (err)
+		dev_err(&client->dev, "Failed to set timeout: %d\n", err);
+
+unlock_mutex:
+	mutex_unlock(&w_priv->sysfs_mutex);
+
+release_firmware:
+	release_firmware(fw);
+
+	return err ? err : count;
+}
+
+static DEVICE_ATTR(update_firmware, S_IWUSR, NULL,
+		   ziirave_wdt_sysfs_store_firm);
+
 static struct attribute *ziirave_wdt_attrs[] = {
 	&dev_attr_firmware_version.attr,
 	&dev_attr_bootloader_version.attr,
 	&dev_attr_reset_reason.attr,
+	&dev_attr_update_firmware.attr,
 	NULL
 };
 ATTRIBUTE_GROUPS(ziirave_wdt);
@@ -252,6 +649,8 @@ static int ziirave_wdt_probe(struct i2c_client *client,
 	if (!w_priv)
 		return -ENOMEM;
 
+	mutex_init(&w_priv->sysfs_mutex);
+
 	w_priv->wdd.info = &ziirave_wdt_info;
 	w_priv->wdd.ops = &ziirave_wdt_ops;
 	w_priv->wdd.min_timeout = ZIIRAVE_TIMEOUT_MIN;
diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c
index c1e9f29c924c..f2d7402abe02 100644
--- a/fs/compat_ioctl.c
+++ b/fs/compat_ioctl.c
@@ -1209,6 +1209,8 @@ COMPATIBLE_IOCTL(WDIOC_SETOPTIONS)
 COMPATIBLE_IOCTL(WDIOC_KEEPALIVE)
 COMPATIBLE_IOCTL(WDIOC_SETTIMEOUT)
 COMPATIBLE_IOCTL(WDIOC_GETTIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_SETPRETIMEOUT)
+COMPATIBLE_IOCTL(WDIOC_GETPRETIMEOUT)
 /* Big R */
 COMPATIBLE_IOCTL(RNDGETENTCNT)
 COMPATIBLE_IOCTL(RNDADDTOENTCNT)
diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c
index 207ba8d627ca..a4b531be9168 100644
--- a/fs/exportfs/expfs.c
+++ b/fs/exportfs/expfs.c
@@ -428,10 +428,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 	if (!nop || !nop->fh_to_dentry)
 		return ERR_PTR(-ESTALE);
 	result = nop->fh_to_dentry(mnt->mnt_sb, fid, fh_len, fileid_type);
-	if (!result)
-		result = ERR_PTR(-ESTALE);
-	if (IS_ERR(result))
-		return result;
+	if (PTR_ERR(result) == -ENOMEM)
+		return ERR_CAST(result);
+	if (IS_ERR_OR_NULL(result))
+		return ERR_PTR(-ESTALE);
 
 	if (d_is_dir(result)) {
 		/*
@@ -541,6 +541,8 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid,
 
  err_result:
 	dput(result);
+	if (err != -ENOMEM)
+		err = -ESTALE;
 	return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(exportfs_decode_fh);
diff --git a/fs/nfs/cache_lib.c b/fs/nfs/cache_lib.c
index 5f7b053720ee..6de15709d024 100644
--- a/fs/nfs/cache_lib.c
+++ b/fs/nfs/cache_lib.c
@@ -76,7 +76,7 @@ static void nfs_dns_cache_revisit(struct cache_deferred_req *d, int toomany)
 
 	dreq = container_of(d, struct nfs_cache_defer_req, deferred_req);
 
-	complete_all(&dreq->completion);
+	complete(&dreq->completion);
 	nfs_cache_defer_req_put(dreq);
 }
 
diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 52a28311e2a4..532d8e242d4d 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -31,8 +31,6 @@
 struct nfs_callback_data {
 	unsigned int users;
 	struct svc_serv *serv;
-	struct svc_rqst *rqst;
-	struct task_struct *task;
 };
 
 static struct nfs_callback_data nfs_callback_info[NFS4_MAX_MINOR_VERSION + 1];
@@ -89,15 +87,6 @@ nfs4_callback_svc(void *vrqstp)
 	return 0;
 }
 
-/*
- * Prepare to bring up the NFSv4 callback service
- */
-static struct svc_rqst *
-nfs4_callback_up(struct svc_serv *serv)
-{
-	return svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-}
-
 #if defined(CONFIG_NFS_V4_1)
 /*
  * The callback service for NFSv4.1 callbacks
@@ -139,29 +128,6 @@ nfs41_callback_svc(void *vrqstp)
 	return 0;
 }
 
-/*
- * Bring up the NFSv4.1 callback service
- */
-static struct svc_rqst *
-nfs41_callback_up(struct svc_serv *serv)
-{
-	struct svc_rqst *rqstp;
-
-	INIT_LIST_HEAD(&serv->sv_cb_list);
-	spin_lock_init(&serv->sv_cb_lock);
-	init_waitqueue_head(&serv->sv_cb_waitq);
-	rqstp = svc_prepare_thread(serv, &serv->sv_pools[0], NUMA_NO_NODE);
-	dprintk("--> %s return %d\n", __func__, PTR_ERR_OR_ZERO(rqstp));
-	return rqstp;
-}
-
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
-		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
-	*rqstpp = nfs41_callback_up(serv);
-	*callback_svc = nfs41_callback_svc;
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct svc_serv *serv)
 {
@@ -173,13 +139,6 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		xprt->bc_serv = serv;
 }
 #else
-static void nfs_minorversion_callback_svc_setup(struct svc_serv *serv,
-		struct svc_rqst **rqstpp, int (**callback_svc)(void *vrqstp))
-{
-	*rqstpp = ERR_PTR(-ENOTSUPP);
-	*callback_svc = ERR_PTR(-ENOTSUPP);
-}
-
 static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 		struct svc_serv *serv)
 {
@@ -189,45 +148,22 @@ static inline void nfs_callback_bc_serv(u32 minorversion, struct rpc_xprt *xprt,
 static int nfs_callback_start_svc(int minorversion, struct rpc_xprt *xprt,
 				  struct svc_serv *serv)
 {
-	struct svc_rqst *rqstp;
-	int (*callback_svc)(void *vrqstp);
-	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	int nrservs = nfs_callback_nr_threads;
 	int ret;
 
 	nfs_callback_bc_serv(minorversion, xprt, serv);
 
-	if (cb_info->task)
-		return 0;
+	if (nrservs < NFS4_MIN_NR_CALLBACK_THREADS)
+		nrservs = NFS4_MIN_NR_CALLBACK_THREADS;
 
-	switch (minorversion) {
-	case 0:
-		/* v4.0 callback setup */
-		rqstp = nfs4_callback_up(serv);
-		callback_svc = nfs4_callback_svc;
-		break;
-	default:
-		nfs_minorversion_callback_svc_setup(serv,
-				&rqstp, &callback_svc);
-	}
-
-	if (IS_ERR(rqstp))
-		return PTR_ERR(rqstp);
-
-	svc_sock_update_bufs(serv);
+	if (serv->sv_nrthreads-1 == nrservs)
+		return 0;
 
-	cb_info->serv = serv;
-	cb_info->rqst = rqstp;
-	cb_info->task = kthread_create(callback_svc, cb_info->rqst,
-				    "nfsv4.%u-svc", minorversion);
-	if (IS_ERR(cb_info->task)) {
-		ret = PTR_ERR(cb_info->task);
-		svc_exit_thread(cb_info->rqst);
-		cb_info->rqst = NULL;
-		cb_info->task = NULL;
+	ret = serv->sv_ops->svo_setup(serv, NULL, nrservs);
+	if (ret) {
+		serv->sv_ops->svo_setup(serv, NULL, 0);
 		return ret;
 	}
-	rqstp->rq_task = cb_info->task;
-	wake_up_process(cb_info->task);
 	dprintk("nfs_callback_up: service started\n");
 	return 0;
 }
@@ -281,19 +217,41 @@ err_bind:
 	return ret;
 }
 
-static struct svc_serv_ops nfs_cb_sv_ops = {
+static struct svc_serv_ops nfs40_cb_sv_ops = {
+	.svo_function		= nfs4_callback_svc,
 	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
+	.svo_setup		= svc_set_num_threads,
+	.svo_module		= THIS_MODULE,
+};
+#if defined(CONFIG_NFS_V4_1)
+static struct svc_serv_ops nfs41_cb_sv_ops = {
+	.svo_function		= nfs41_callback_svc,
+	.svo_enqueue_xprt	= svc_xprt_do_enqueue,
+	.svo_setup		= svc_set_num_threads,
+	.svo_module		= THIS_MODULE,
+};
+
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+	[0] = &nfs40_cb_sv_ops,
+	[1] = &nfs41_cb_sv_ops,
+};
+#else
+struct svc_serv_ops *nfs4_cb_sv_ops[] = {
+	[0] = &nfs40_cb_sv_ops,
+	[1] = NULL,
 };
+#endif
 
 static struct svc_serv *nfs_callback_create_svc(int minorversion)
 {
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
 	struct svc_serv *serv;
+	struct svc_serv_ops *sv_ops;
 
 	/*
 	 * Check whether we're already up and running.
 	 */
-	if (cb_info->task) {
+	if (cb_info->serv) {
 		/*
 		 * Note: increase service usage, because later in case of error
 		 * svc_destroy() will be called.
@@ -302,6 +260,17 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
 		return cb_info->serv;
 	}
 
+	switch (minorversion) {
+	case 0:
+		sv_ops = nfs4_cb_sv_ops[0];
+		break;
+	default:
+		sv_ops = nfs4_cb_sv_ops[1];
+	}
+
+	if (sv_ops == NULL)
+		return ERR_PTR(-ENOTSUPP);
+
 	/*
 	 * Sanity check: if there's no task,
 	 * we should be the first user ...
@@ -310,11 +279,12 @@ static struct svc_serv *nfs_callback_create_svc(int minorversion)
 		printk(KERN_WARNING "nfs_callback_create_svc: no kthread, %d users??\n",
 			cb_info->users);
 
-	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, &nfs_cb_sv_ops);
+	serv = svc_create(&nfs4_callback_program, NFS4_CALLBACK_BUFSIZE, sv_ops);
 	if (!serv) {
 		printk(KERN_ERR "nfs_callback_create_svc: create service failed\n");
 		return ERR_PTR(-ENOMEM);
 	}
+	cb_info->serv = serv;
 	/* As there is only one thread we need to over-ride the
 	 * default maximum of 80 connections
 	 */
@@ -357,6 +327,8 @@ int nfs_callback_up(u32 minorversion, struct rpc_xprt *xprt)
 	 * thread exits.
 	 */
 err_net:
+	if (!cb_info->users)
+		cb_info->serv = NULL;
 	svc_destroy(serv);
 err_create:
 	mutex_unlock(&nfs_callback_mutex);
@@ -374,18 +346,18 @@ err_start:
 void nfs_callback_down(int minorversion, struct net *net)
 {
 	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];
+	struct svc_serv *serv;
 
 	mutex_lock(&nfs_callback_mutex);
-	nfs_callback_down_net(minorversion, cb_info->serv, net);
+	serv = cb_info->serv;
+	nfs_callback_down_net(minorversion, serv, net);
 	cb_info->users--;
-	if (cb_info->users == 0 && cb_info->task != NULL) {
-		kthread_stop(cb_info->task);
-		dprintk("nfs_callback_down: service stopped\n");
-		svc_exit_thread(cb_info->rqst);
+	if (cb_info->users == 0) {
+		svc_get(serv);
+		serv->sv_ops->svo_setup(serv, NULL, 0);
+		svc_destroy(serv);
 		dprintk("nfs_callback_down: service destroyed\n");
 		cb_info->serv = NULL;
-		cb_info->rqst = NULL;
-		cb_info->task = NULL;
 	}
 	mutex_unlock(&nfs_callback_mutex);
 }
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 5fe1cecbf9f0..c701c308fac5 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -179,6 +179,15 @@ extern __be32 nfs4_callback_devicenotify(
 	struct cb_devicenotifyargs *args,
 	void *dummy, struct cb_process_state *cps);
 
+struct cb_notify_lock_args {
+	struct nfs_fh			cbnl_fh;
+	struct nfs_lowner		cbnl_owner;
+	bool				cbnl_valid;
+};
+
+extern __be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args,
+					 void *dummy,
+					 struct cb_process_state *cps);
 #endif /* CONFIG_NFS_V4_1 */
 extern int check_gss_callback_principal(struct nfs_client *, struct svc_rqst *);
 extern __be32 nfs4_callback_getattr(struct cb_getattrargs *args,
@@ -198,6 +207,9 @@ extern void nfs_callback_down(int minorversion, struct net *net);
 #define NFS41_BC_MIN_CALLBACKS 1
 #define NFS41_BC_MAX_CALLBACKS 1
 
+#define NFS4_MIN_NR_CALLBACK_THREADS 1
+
 extern unsigned int nfs_callback_set_tcpport;
+extern unsigned short nfs_callback_nr_threads;
 
 #endif /* __LINUX_FS_NFS_CALLBACK_H */
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index f953ef6b2f2e..e9aa235e9d10 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -628,4 +628,20 @@ out:
 	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
 	return status;
 }
+
+__be32 nfs4_callback_notify_lock(struct cb_notify_lock_args *args, void *dummy,
+				 struct cb_process_state *cps)
+{
+	if (!cps->clp) /* set in cb_sequence */
+		return htonl(NFS4ERR_OP_NOT_IN_SESSION);
+
+	dprintk_rcu("NFS: CB_NOTIFY_LOCK request from %s\n",
+		rpc_peeraddr2str(cps->clp->cl_rpcclient, RPC_DISPLAY_ADDR));
+
+	/* Don't wake anybody if the string looked bogus */
+	if (args->cbnl_valid)
+		__wake_up(&cps->clp->cl_lock_waitq, TASK_NORMAL, 0, args);
+
+	return htonl(NFS4_OK);
+}
 #endif /* CONFIG_NFS_V4_1 */
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 656f68f7fe53..eb094c6011d8 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -35,6 +35,7 @@
 					 (1 + 3) * 4) // seqid, 3 slotids
 #define CB_OP_RECALLANY_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #define CB_OP_RECALLSLOT_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
+#define CB_OP_NOTIFY_LOCK_RES_MAXSZ	(CB_OP_HDR_RES_MAXSZ)
 #endif /* CONFIG_NFS_V4_1 */
 
 #define NFSDBG_FACILITY NFSDBG_CALLBACK
@@ -72,7 +73,7 @@ static int nfs4_encode_void(struct svc_rqst *rqstp, __be32 *p, void *dummy)
 	return xdr_ressize_check(rqstp, p);
 }
 
-static __be32 *read_buf(struct xdr_stream *xdr, int nbytes)
+static __be32 *read_buf(struct xdr_stream *xdr, size_t nbytes)
 {
 	__be32 *p;
 
@@ -534,6 +535,49 @@ static __be32 decode_recallslot_args(struct svc_rqst *rqstp,
 	return 0;
 }
 
+static __be32 decode_lockowner(struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+	__be32		*p;
+	unsigned int	len;
+
+	p = read_buf(xdr, 12);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+
+	p = xdr_decode_hyper(p, &args->cbnl_owner.clientid);
+	len = be32_to_cpu(*p);
+
+	p = read_buf(xdr, len);
+	if (unlikely(p == NULL))
+		return htonl(NFS4ERR_BADXDR);
+
+	/* Only try to decode if the length is right */
+	if (len == 20) {
+		p += 2;	/* skip "lock id:" */
+		args->cbnl_owner.s_dev = be32_to_cpu(*p++);
+		xdr_decode_hyper(p, &args->cbnl_owner.id);
+		args->cbnl_valid = true;
+	} else {
+		args->cbnl_owner.s_dev = 0;
+		args->cbnl_owner.id = 0;
+		args->cbnl_valid = false;
+	}
+	return 0;
+}
+
+static __be32 decode_notify_lock_args(struct svc_rqst *rqstp, struct xdr_stream *xdr, struct cb_notify_lock_args *args)
+{
+	__be32 status;
+
+	status = decode_fh(xdr, &args->cbnl_fh);
+	if (unlikely(status != 0))
+		goto out;
+	status = decode_lockowner(xdr, args);
+out:
+	dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
+	return status;
+}
+
 #endif /* CONFIG_NFS_V4_1 */
 
 static __be32 encode_string(struct xdr_stream *xdr, unsigned int len, const char *str)
@@ -746,6 +790,7 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	case OP_CB_RECALL_SLOT:
 	case OP_CB_LAYOUTRECALL:
 	case OP_CB_NOTIFY_DEVICEID:
+	case OP_CB_NOTIFY_LOCK:
 		*op = &callback_ops[op_nr];
 		break;
 
@@ -753,7 +798,6 @@ preprocess_nfs41_op(int nop, unsigned int op_nr, struct callback_op **op)
 	case OP_CB_PUSH_DELEG:
 	case OP_CB_RECALLABLE_OBJ_AVAIL:
 	case OP_CB_WANTS_CANCELLED:
-	case OP_CB_NOTIFY_LOCK:
 		return htonl(NFS4ERR_NOTSUPP);
 
 	default:
@@ -1006,6 +1050,11 @@ static struct callback_op callback_ops[] = {
 		.decode_args = (callback_decode_arg_t)decode_recallslot_args,
 		.res_maxsize = CB_OP_RECALLSLOT_RES_MAXSZ,
 	},
+	[OP_CB_NOTIFY_LOCK] = {
+		.process_op = (callback_process_op_t)nfs4_callback_notify_lock,
+		.decode_args = (callback_decode_arg_t)decode_notify_lock_args,
+		.res_maxsize = CB_OP_NOTIFY_LOCK_RES_MAXSZ,
+	},
 #endif /* CONFIG_NFS_V4_1 */
 };
 
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 1e106780a237..7555ba889d1f 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -313,7 +313,10 @@ static struct nfs_client *nfs_match_client(const struct nfs_client_initdata *dat
 			continue;
 		/* Match the full socket address */
 		if (!rpc_cmp_addr_port(sap, clap))
-			continue;
+			/* Match all xprt_switch full socket addresses */
+			if (!rpc_clnt_xprt_switch_has_addr(clp->cl_rpcclient,
+							   sap))
+				continue;
 
 		atomic_inc(&clp->cl_count);
 		return clp;
@@ -785,7 +788,8 @@ int nfs_probe_fsinfo(struct nfs_server *server, struct nfs_fh *mntfh, struct nfs
 	}
 
 	fsinfo.fattr = fattr;
-	fsinfo.layouttype = 0;
+	fsinfo.nlayouttypes = 0;
+	memset(fsinfo.layouttype, 0, sizeof(fsinfo.layouttype));
 	error = clp->rpc_ops->fsinfo(server, mntfh, &fsinfo);
 	if (error < 0)
 		goto out_error;
@@ -1078,7 +1082,7 @@ void nfs_clients_init(struct net *net)
 	idr_init(&nn->cb_ident_idr);
 #endif
 	spin_lock_init(&nn->nfs_client_lock);
-	nn->boot_time = CURRENT_TIME;
+	nn->boot_time = ktime_get_real();
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 322c2585bc34..dff600ae0d74 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -41,6 +41,17 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation)
 	set_bit(NFS_DELEGATION_REFERENCED, &delegation->flags);
 }
 
+static bool
+nfs4_is_valid_delegation(const struct nfs_delegation *delegation,
+		fmode_t flags)
+{
+	if (delegation != NULL && (delegation->type & flags) == flags &&
+	    !test_bit(NFS_DELEGATION_REVOKED, &delegation->flags) &&
+	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags))
+		return true;
+	return false;
+}
+
 static int
 nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 {
@@ -50,8 +61,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark)
 	flags &= FMODE_READ|FMODE_WRITE;
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL && (delegation->type & flags) == flags &&
-	    !test_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+	if (nfs4_is_valid_delegation(delegation, flags)) {
 		if (mark)
 			nfs_mark_delegation_referenced(delegation);
 		ret = 1;
@@ -185,15 +195,13 @@ void nfs_inode_reclaim_delegation(struct inode *inode, struct rpc_cred *cred,
 			rcu_read_unlock();
 			put_rpccred(oldcred);
 			trace_nfs4_reclaim_delegation(inode, res->delegation_type);
-		} else {
-			/* We appear to have raced with a delegation return. */
-			spin_unlock(&delegation->lock);
-			rcu_read_unlock();
-			nfs_inode_set_delegation(inode, cred, res);
+			return;
 		}
-	} else {
-		rcu_read_unlock();
+		/* We appear to have raced with a delegation return. */
+		spin_unlock(&delegation->lock);
 	}
+	rcu_read_unlock();
+	nfs_inode_set_delegation(inode, cred, res);
 }
 
 static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
@@ -642,28 +650,49 @@ static void nfs_client_mark_return_unused_delegation_types(struct nfs_client *cl
 	rcu_read_unlock();
 }
 
-static void nfs_revoke_delegation(struct inode *inode)
+static void nfs_mark_delegation_revoked(struct nfs_server *server,
+		struct nfs_delegation *delegation)
+{
+	set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
+	delegation->stateid.type = NFS4_INVALID_STATEID_TYPE;
+	nfs_mark_return_delegation(server, delegation);
+}
+
+static bool nfs_revoke_delegation(struct inode *inode,
+		const nfs4_stateid *stateid)
 {
 	struct nfs_delegation *delegation;
+	nfs4_stateid tmp;
+	bool ret = false;
+
 	rcu_read_lock();
 	delegation = rcu_dereference(NFS_I(inode)->delegation);
-	if (delegation != NULL) {
-		set_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
-		nfs_mark_return_delegation(NFS_SERVER(inode), delegation);
-	}
+	if (delegation == NULL)
+		goto out;
+	if (stateid == NULL) {
+		nfs4_stateid_copy(&tmp, &delegation->stateid);
+		stateid = &tmp;
+	} else if (!nfs4_stateid_match(stateid, &delegation->stateid))
+		goto out;
+	nfs_mark_delegation_revoked(NFS_SERVER(inode), delegation);
+	ret = true;
+out:
 	rcu_read_unlock();
+	if (ret)
+		nfs_inode_find_state_and_recover(inode, stateid);
+	return ret;
 }
 
-void nfs_remove_bad_delegation(struct inode *inode)
+void nfs_remove_bad_delegation(struct inode *inode,
+		const nfs4_stateid *stateid)
 {
 	struct nfs_delegation *delegation;
 
-	nfs_revoke_delegation(inode);
+	if (!nfs_revoke_delegation(inode, stateid))
+		return;
 	delegation = nfs_inode_detach_delegation(inode);
-	if (delegation) {
-		nfs_inode_find_state_and_recover(inode, &delegation->stateid);
+	if (delegation)
 		nfs_free_delegation(delegation);
-	}
 }
 EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation);
 
@@ -786,8 +815,15 @@ static void nfs_delegation_mark_reclaim_server(struct nfs_server *server)
 {
 	struct nfs_delegation *delegation;
 
-	list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+		/*
+		 * If the delegation may have been admin revoked, then we
+		 * cannot reclaim it.
+		 */
+		if (test_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags))
+			continue;
 		set_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+	}
 }
 
 /**
@@ -851,6 +887,141 @@ restart:
 	rcu_read_unlock();
 }
 
+static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
+{
+	return (clp->cl_state & (BIT(NFS4CLNT_CHECK_LEASE) |
+				BIT(NFS4CLNT_LEASE_EXPIRED) |
+				BIT(NFS4CLNT_SESSION_RESET))) != 0;
+}
+
+static void nfs_mark_test_expired_delegation(struct nfs_server *server,
+	    struct nfs_delegation *delegation)
+{
+	if (delegation->stateid.type == NFS4_INVALID_STATEID_TYPE)
+		return;
+	clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+	set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+	set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
+}
+
+static void nfs_inode_mark_test_expired_delegation(struct nfs_server *server,
+		struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation)
+		nfs_mark_test_expired_delegation(server, delegation);
+	rcu_read_unlock();
+
+}
+
+static void nfs_delegation_mark_test_expired_server(struct nfs_server *server)
+{
+	struct nfs_delegation *delegation;
+
+	list_for_each_entry_rcu(delegation, &server->delegations, super_list)
+		nfs_mark_test_expired_delegation(server, delegation);
+}
+
+/**
+ * nfs_mark_test_expired_all_delegations - mark all delegations for testing
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * marks them as needing to be checked for validity.
+ */
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp)
+{
+	struct nfs_server *server;
+
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
+		nfs_delegation_mark_test_expired_server(server);
+	rcu_read_unlock();
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks if they have may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+	const struct nfs4_minor_version_ops *ops = clp->cl_mvops;
+	struct nfs_delegation *delegation;
+	struct nfs_server *server;
+	struct inode *inode;
+	struct rpc_cred *cred;
+	nfs4_stateid stateid;
+
+restart:
+	rcu_read_lock();
+	list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
+		list_for_each_entry_rcu(delegation, &server->delegations,
+								super_list) {
+			if (test_bit(NFS_DELEGATION_RETURNING,
+						&delegation->flags))
+				continue;
+			if (test_bit(NFS_DELEGATION_TEST_EXPIRED,
+						&delegation->flags) == 0)
+				continue;
+			if (!nfs_sb_active(server->super))
+				continue;
+			inode = nfs_delegation_grab_inode(delegation);
+			if (inode == NULL) {
+				rcu_read_unlock();
+				nfs_sb_deactive(server->super);
+				goto restart;
+			}
+			cred = get_rpccred_rcu(delegation->cred);
+			nfs4_stateid_copy(&stateid, &delegation->stateid);
+			clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+			rcu_read_unlock();
+			if (cred != NULL &&
+			    ops->test_and_free_expired(server, &stateid, cred) < 0) {
+				nfs_revoke_delegation(inode, &stateid);
+				nfs_inode_find_state_and_recover(inode, &stateid);
+			}
+			put_rpccred(cred);
+			if (nfs4_server_rebooted(clp)) {
+				nfs_inode_mark_test_expired_delegation(server,inode);
+				iput(inode);
+				nfs_sb_deactive(server->super);
+				return;
+			}
+			iput(inode);
+			nfs_sb_deactive(server->super);
+			goto restart;
+		}
+	}
+	rcu_read_unlock();
+}
+
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+		const nfs4_stateid *stateid)
+{
+	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_delegation *delegation;
+	bool found = false;
+
+	rcu_read_lock();
+	delegation = rcu_dereference(NFS_I(inode)->delegation);
+	if (delegation &&
+	    nfs4_stateid_match_other(&delegation->stateid, stateid)) {
+		nfs_mark_test_expired_delegation(NFS_SERVER(inode), delegation);
+		found = true;
+	}
+	rcu_read_unlock();
+	if (found)
+		nfs4_schedule_state_manager(clp);
+}
+
 /**
  * nfs_delegations_present - check for existence of delegations
  * @clp: client state handle
@@ -893,7 +1064,7 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
 	flags &= FMODE_READ|FMODE_WRITE;
 	rcu_read_lock();
 	delegation = rcu_dereference(nfsi->delegation);
-	ret = (delegation != NULL && (delegation->type & flags) == flags);
+	ret = nfs4_is_valid_delegation(delegation, flags);
 	if (ret) {
 		nfs4_stateid_copy(dst, &delegation->stateid);
 		nfs_mark_delegation_referenced(delegation);
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 64724d252a79..e9d555796873 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -32,6 +32,7 @@ enum {
 	NFS_DELEGATION_REFERENCED,
 	NFS_DELEGATION_RETURNING,
 	NFS_DELEGATION_REVOKED,
+	NFS_DELEGATION_TEST_EXPIRED,
 };
 
 int nfs_inode_set_delegation(struct inode *inode, struct rpc_cred *cred, struct nfs_openres *res);
@@ -47,11 +48,14 @@ void nfs_expire_unused_delegation_types(struct nfs_client *clp, fmode_t flags);
 void nfs_expire_unreferenced_delegations(struct nfs_client *clp);
 int nfs_client_return_marked_delegations(struct nfs_client *clp);
 int nfs_delegations_present(struct nfs_client *clp);
-void nfs_remove_bad_delegation(struct inode *inode);
+void nfs_remove_bad_delegation(struct inode *inode, const nfs4_stateid *stateid);
 
 void nfs_delegation_mark_reclaim(struct nfs_client *clp);
 void nfs_delegation_reap_unclaimed(struct nfs_client *clp);
 
+void nfs_mark_test_expired_all_delegations(struct nfs_client *clp);
+void nfs_reap_expired_delegations(struct nfs_client *clp);
+
 /* NFSv4 delegation-related procedures */
 int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4_stateid *stateid, int issync);
 int nfs4_open_delegation_recall(struct nfs_open_context *ctx, struct nfs4_state *state, const nfs4_stateid *stateid, fmode_t type);
@@ -62,6 +66,8 @@ void nfs_mark_delegation_referenced(struct nfs_delegation *delegation);
 int nfs4_have_delegation(struct inode *inode, fmode_t flags);
 int nfs4_check_delegation(struct inode *inode, fmode_t flags);
 bool nfs4_delegation_flush_on_close(const struct inode *inode);
+void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
+		const nfs4_stateid *stateid);
 
 #endif
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 06e0bf092ba9..5f1af4cd1a33 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -435,11 +435,11 @@ int nfs_same_file(struct dentry *dentry, struct nfs_entry *entry)
 		return 0;
 
 	nfsi = NFS_I(inode);
-	if (entry->fattr->fileid == nfsi->fileid)
-		return 1;
-	if (nfs_compare_fh(entry->fh, &nfsi->fh) == 0)
-		return 1;
-	return 0;
+	if (entry->fattr->fileid != nfsi->fileid)
+		return 0;
+	if (entry->fh->size && nfs_compare_fh(entry->fh, &nfsi->fh) != 0)
+		return 0;
+	return 1;
 }
 
 static
@@ -496,6 +496,14 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry)
 		return;
 	if (!(entry->fattr->valid & NFS_ATTR_FATTR_FSID))
 		return;
+	if (filename.len == 0)
+		return;
+	/* Validate that the name doesn't contain any illegal '\0' */
+	if (strnlen(filename.name, filename.len) != filename.len)
+		return;
+	/* ...or '/' */
+	if (strnchr(filename.name, filename.len, '/'))
+		return;
 	if (filename.name[0] == '.') {
 		if (filename.len == 1)
 			return;
@@ -517,6 +525,8 @@ again:
 					&entry->fattr->fsid))
 			goto out;
 		if (nfs_same_file(dentry, entry)) {
+			if (!entry->fh->size)
+				goto out;
 			nfs_set_verifier(dentry, nfs_save_change_attribute(dir));
 			status = nfs_refresh_inode(d_inode(dentry), entry->fattr);
 			if (!status)
@@ -529,6 +539,10 @@ again:
 			goto again;
 		}
 	}
+	if (!entry->fh->size) {
+		d_lookup_done(dentry);
+		goto out;
+	}
 
 	inode = nfs_fhget(dentry->d_sb, entry->fh, entry->fattr, entry->label);
 	alias = d_splice_alias(inode, dentry);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 72b7d13ee3c6..bd81bcf3ffcf 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -387,7 +387,7 @@ static void nfs_direct_complete(struct nfs_direct_req *dreq)
 		dreq->iocb->ki_complete(dreq->iocb, res, 0);
 	}
 
-	complete_all(&dreq->completion);
+	complete(&dreq->completion);
 
 	nfs_direct_req_release(dreq);
 }
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 2efbdde36c3e..9ea85ae23c32 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -520,7 +520,9 @@ const struct address_space_operations nfs_file_aops = {
 	.invalidatepage = nfs_invalidate_page,
 	.releasepage = nfs_release_page,
 	.direct_IO = nfs_direct_IO,
+#ifdef CONFIG_MIGRATION
 	.migratepage = nfs_migrate_page,
+#endif
 	.launder_page = nfs_launder_page,
 	.is_dirty_writeback = nfs_check_dirty_writeback,
 	.error_remove_page = generic_error_remove_page,
@@ -685,11 +687,6 @@ out_noconflict:
 	goto out;
 }
 
-static int do_vfs_lock(struct file *file, struct file_lock *fl)
-{
-	return locks_lock_file_wait(file, fl);
-}
-
 static int
 do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 {
@@ -722,7 +719,7 @@ do_unlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
-		status = do_vfs_lock(filp, fl);
+		status = locks_lock_file_wait(filp, fl);
 	return status;
 }
 
@@ -747,7 +744,7 @@ do_setlk(struct file *filp, int cmd, struct file_lock *fl, int is_local)
 	if (!is_local)
 		status = NFS_PROTO(inode)->lock(filp, cmd, fl);
 	else
-		status = do_vfs_lock(filp, fl);
+		status = locks_lock_file_wait(filp, fl);
 	if (status < 0)
 		goto out;
 
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 51b51369704c..98ace127bf86 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1080,7 +1080,7 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 	case -NFS4ERR_BAD_STATEID:
 		if (state == NULL)
 			break;
-		nfs_remove_bad_delegation(state->inode);
+		nfs_remove_bad_delegation(state->inode, NULL);
 	case -NFS4ERR_OPENMODE:
 		if (state == NULL)
 			break;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index a6acce663219..80bcc0befb07 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -534,12 +534,9 @@ void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
 }
 #endif
 
-
 #ifdef CONFIG_MIGRATION
 extern int nfs_migrate_page(struct address_space *,
 		struct page *, struct page *, enum migrate_mode);
-#else
-#define nfs_migrate_page NULL
 #endif
 
 static inline int
@@ -562,7 +559,6 @@ void nfs_init_cinfo_from_dreq(struct nfs_commit_info *cinfo,
 extern ssize_t nfs_dreq_bytes_left(struct nfs_direct_req *dreq);
 
 /* nfs4proc.c */
-extern void __nfs4_read_done_cb(struct nfs_pgio_header *);
 extern struct nfs_client *nfs4_init_client(struct nfs_client *clp,
 			    const struct nfs_client_initdata *);
 extern int nfs40_walk_client_list(struct nfs_client *clp,
@@ -571,6 +567,9 @@ extern int nfs40_walk_client_list(struct nfs_client *clp,
 extern int nfs41_walk_client_list(struct nfs_client *clp,
 				struct nfs_client **result,
 				struct rpc_cred *cred);
+extern int nfs4_test_session_trunk(struct rpc_clnt *,
+				struct rpc_xprt *,
+				void *);
 
 static inline struct inode *nfs_igrab_and_active(struct inode *inode)
 {
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index f0e06e4acbef..fbce0d885d4c 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -29,7 +29,7 @@ struct nfs_net {
 	int cb_users[NFS4_MAX_MINOR_VERSION + 1];
 #endif
 	spinlock_t nfs_client_lock;
-	struct timespec boot_time;
+	ktime_t boot_time;
 #ifdef CONFIG_PROC_FS
 	struct proc_dir_entry *proc_nfsfs;
 #endif
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 64b43b4ad9dd..608501971fe0 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -443,6 +443,7 @@ int nfs42_proc_layoutstats_generic(struct nfs_server *server,
 	task = rpc_run_task(&task_setup);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
+	rpc_put_task(task);
 	return 0;
 }
 
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 9bf64eacba5b..9b3a82abab07 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -39,6 +39,7 @@ enum nfs4_client_state {
 	NFS4CLNT_BIND_CONN_TO_SESSION,
 	NFS4CLNT_MOVED,
 	NFS4CLNT_LEASE_MOVED,
+	NFS4CLNT_DELEGATION_EXPIRED,
 };
 
 #define NFS4_RENEW_TIMEOUT		0x01
@@ -57,8 +58,11 @@ struct nfs4_minor_version_ops {
 			struct nfs_fsinfo *);
 	void	(*free_lock_state)(struct nfs_server *,
 			struct nfs4_lock_state *);
+	int	(*test_and_free_expired)(struct nfs_server *,
+			nfs4_stateid *, struct rpc_cred *);
 	struct nfs_seqid *
 		(*alloc_seqid)(struct nfs_seqid_counter *, gfp_t);
+	int	(*session_trunk)(struct rpc_clnt *, struct rpc_xprt *, void *);
 	const struct rpc_call_ops *call_sync_ops;
 	const struct nfs4_state_recovery_ops *reboot_recovery_ops;
 	const struct nfs4_state_recovery_ops *nograce_recovery_ops;
@@ -156,6 +160,7 @@ enum {
 	NFS_STATE_RECLAIM_NOGRACE,	/* OPEN stateid needs to recover state */
 	NFS_STATE_POSIX_LOCKS,		/* Posix locks are supported */
 	NFS_STATE_RECOVERY_FAILED,	/* OPEN stateid state recovery failed */
+	NFS_STATE_MAY_NOTIFY_LOCK,	/* server may CB_NOTIFY_LOCK */
 };
 
 struct nfs4_state {
@@ -203,6 +208,11 @@ struct nfs4_state_recovery_ops {
 		struct rpc_cred *);
 };
 
+struct nfs4_add_xprt_data {
+	struct nfs_client	*clp;
+	struct rpc_cred		*cred;
+};
+
 struct nfs4_state_maintenance_ops {
 	int (*sched_state_renewal)(struct nfs_client *, struct rpc_cred *, unsigned);
 	struct rpc_cred * (*get_state_renewal_cred_locked)(struct nfs_client *);
@@ -278,6 +288,8 @@ extern int nfs4_proc_get_lease_time(struct nfs_client *clp,
 		struct nfs_fsinfo *fsinfo);
 extern int nfs4_proc_layoutcommit(struct nfs4_layoutcommit_data *data,
 				  bool sync);
+extern int nfs4_detect_session_trunking(struct nfs_client *clp,
+		struct nfs41_exchange_id_res *res, struct rpc_xprt *xprt);
 
 static inline bool
 is_ds_only_client(struct nfs_client *clp)
@@ -439,7 +451,7 @@ extern void nfs4_schedule_path_down_recovery(struct nfs_client *clp);
 extern int nfs4_schedule_stateid_recovery(const struct nfs_server *, struct nfs4_state *);
 extern int nfs4_schedule_migration_recovery(const struct nfs_server *);
 extern void nfs4_schedule_lease_moved_recovery(struct nfs_client *);
-extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags);
+extern void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags, bool);
 extern void nfs41_handle_server_scope(struct nfs_client *,
 				      struct nfs41_server_scope **);
 extern void nfs4_put_lock_state(struct nfs4_lock_state *lsp);
@@ -471,6 +483,7 @@ extern struct nfs_subversion nfs_v4;
 struct dentry *nfs4_try_mount(int, const char *, struct nfs_mount_info *, struct nfs_subversion *);
 extern bool nfs4_disable_idmapping;
 extern unsigned short max_session_slots;
+extern unsigned short max_session_cb_slots;
 extern unsigned short send_implementation_id;
 extern bool recover_lost_locks;
 
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index cd3b7cfdde16..074ac7131459 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -199,6 +199,9 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 	clp->cl_minorversion = cl_init->minorversion;
 	clp->cl_mvops = nfs_v4_minor_ops[cl_init->minorversion];
 	clp->cl_mig_gen = 1;
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+	init_waitqueue_head(&clp->cl_lock_waitq);
+#endif
 	return clp;
 
 error:
@@ -562,15 +565,15 @@ out:
 /*
  * Returns true if the client IDs match
  */
-static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
+static bool nfs4_match_clientids(u64 a, u64 b)
 {
-	if (a->cl_clientid != b->cl_clientid) {
+	if (a != b) {
 		dprintk("NFS: --> %s client ID %llx does not match %llx\n",
-			__func__, a->cl_clientid, b->cl_clientid);
+			__func__, a, b);
 		return false;
 	}
 	dprintk("NFS: --> %s client ID %llx matches %llx\n",
-		__func__, a->cl_clientid, b->cl_clientid);
+		__func__, a, b);
 	return true;
 }
 
@@ -578,17 +581,15 @@ static bool nfs4_match_clientids(struct nfs_client *a, struct nfs_client *b)
  * Returns true if the server major ids match
  */
 static bool
-nfs4_check_clientid_trunking(struct nfs_client *a, struct nfs_client *b)
+nfs4_check_serverowner_major_id(struct nfs41_server_owner *o1,
+				struct nfs41_server_owner *o2)
 {
-	struct nfs41_server_owner *o1 = a->cl_serverowner;
-	struct nfs41_server_owner *o2 = b->cl_serverowner;
-
 	if (o1->major_id_sz != o2->major_id_sz)
 		goto out_major_mismatch;
 	if (memcmp(o1->major_id, o2->major_id, o1->major_id_sz) != 0)
 		goto out_major_mismatch;
 
-	dprintk("NFS: --> %s server owners match\n", __func__);
+	dprintk("NFS: --> %s server owner major IDs match\n", __func__);
 	return true;
 
 out_major_mismatch:
@@ -597,6 +598,100 @@ out_major_mismatch:
 	return false;
 }
 
+/*
+ * Returns true if server minor ids match
+ */
+static bool
+nfs4_check_serverowner_minor_id(struct nfs41_server_owner *o1,
+				struct nfs41_server_owner *o2)
+{
+	/* Check eir_server_owner so_minor_id */
+	if (o1->minor_id != o2->minor_id)
+		goto out_minor_mismatch;
+
+	dprintk("NFS: --> %s server owner minor IDs match\n", __func__);
+	return true;
+
+out_minor_mismatch:
+	dprintk("NFS: --> %s server owner minor IDs do not match\n", __func__);
+	return false;
+}
+
+/*
+ * Returns true if the server scopes match
+ */
+static bool
+nfs4_check_server_scope(struct nfs41_server_scope *s1,
+			struct nfs41_server_scope *s2)
+{
+	if (s1->server_scope_sz != s2->server_scope_sz)
+		goto out_scope_mismatch;
+	if (memcmp(s1->server_scope, s2->server_scope,
+		   s1->server_scope_sz) != 0)
+		goto out_scope_mismatch;
+
+	dprintk("NFS: --> %s server scopes match\n", __func__);
+	return true;
+
+out_scope_mismatch:
+	dprintk("NFS: --> %s server scopes do not match\n",
+		__func__);
+	return false;
+}
+
+/**
+ * nfs4_detect_session_trunking - Checks for session trunking.
+ *
+ * Called after a successful EXCHANGE_ID on a multi-addr connection.
+ * Upon success, add the transport.
+ *
+ * @clp:    original mount nfs_client
+ * @res:    result structure from an exchange_id using the original mount
+ *          nfs_client with a new multi_addr transport
+ *
+ * Returns zero on success, otherwise -EINVAL
+ *
+ * Note: since the exchange_id for the new multi_addr transport uses the
+ * same nfs_client from the original mount, the cl_owner_id is reused,
+ * so eir_clientowner is the same.
+ */
+int nfs4_detect_session_trunking(struct nfs_client *clp,
+				 struct nfs41_exchange_id_res *res,
+				 struct rpc_xprt *xprt)
+{
+	/* Check eir_clientid */
+	if (!nfs4_match_clientids(clp->cl_clientid, res->clientid))
+		goto out_err;
+
+	/* Check eir_server_owner so_major_id */
+	if (!nfs4_check_serverowner_major_id(clp->cl_serverowner,
+					     res->server_owner))
+		goto out_err;
+
+	/* Check eir_server_owner so_minor_id */
+	if (!nfs4_check_serverowner_minor_id(clp->cl_serverowner,
+					     res->server_owner))
+		goto out_err;
+
+	/* Check eir_server_scope */
+	if (!nfs4_check_server_scope(clp->cl_serverscope, res->server_scope))
+		goto out_err;
+
+	/* Session trunking passed, add the xprt */
+	rpc_clnt_xprt_switch_add_xprt(clp->cl_rpcclient, xprt);
+
+	pr_info("NFS:  %s: Session trunking succeeded for %s\n",
+		clp->cl_hostname,
+		xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	return 0;
+out_err:
+	pr_info("NFS:  %s: Session trunking failed for %s\n", clp->cl_hostname,
+		xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	return -EINVAL;
+}
+
 /**
  * nfs41_walk_client_list - Find nfs_client that matches a client/server owner
  *
@@ -650,7 +745,7 @@ int nfs41_walk_client_list(struct nfs_client *new,
 		if (pos->cl_cons_state != NFS_CS_READY)
 			continue;
 
-		if (!nfs4_match_clientids(pos, new))
+		if (!nfs4_match_clientids(pos->cl_clientid, new->cl_clientid))
 			continue;
 
 		/*
@@ -658,7 +753,8 @@ int nfs41_walk_client_list(struct nfs_client *new,
 		 * client id trunking. In either case, we want to fall back
 		 * to using the existing nfs_client.
 		 */
-		if (!nfs4_check_clientid_trunking(pos, new))
+		if (!nfs4_check_serverowner_major_id(pos->cl_serverowner,
+						     new->cl_serverowner))
 			continue;
 
 		/* Unlike NFSv4.0, we know that NFSv4.1 always uses the
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index 0e327528a3ce..ad917bd72b38 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -99,8 +99,8 @@ static int nfs4_do_setattr(struct inode *inode, struct rpc_cred *cred,
 #ifdef CONFIG_NFS_V4_1
 static int nfs41_test_stateid(struct nfs_server *, nfs4_stateid *,
 		struct rpc_cred *);
-static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *,
-		struct rpc_cred *);
+static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *,
+		struct rpc_cred *, bool);
 #endif
 
 #ifdef CONFIG_NFS_V4_SECURITY_LABEL
@@ -328,6 +328,33 @@ static void nfs4_setup_readdir(u64 cookie, __be32 *verifier, struct dentry *dent
 	kunmap_atomic(start);
 }
 
+static void nfs4_test_and_free_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	const struct nfs4_minor_version_ops *ops = server->nfs_client->cl_mvops;
+
+	ops->test_and_free_expired(server, stateid, cred);
+}
+
+static void __nfs4_free_revoked_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	stateid->type = NFS4_REVOKED_STATEID_TYPE;
+	nfs4_test_and_free_stateid(server, stateid, cred);
+}
+
+static void nfs4_free_revoked_stateid(struct nfs_server *server,
+		const nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	nfs4_stateid tmp;
+
+	nfs4_stateid_copy(&tmp, stateid);
+	__nfs4_free_revoked_stateid(server, &tmp, cred);
+}
+
 static long nfs4_update_delay(long *timeout)
 {
 	long ret;
@@ -370,13 +397,23 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 	exception->delay = 0;
 	exception->recovering = 0;
 	exception->retry = 0;
+
+	if (stateid == NULL && state != NULL)
+		stateid = &state->stateid;
+
 	switch(errorcode) {
 		case 0:
 			return 0;
-		case -NFS4ERR_OPENMODE:
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_BAD_STATEID:
+			if (inode != NULL && stateid != NULL) {
+				nfs_inode_find_state_and_recover(inode,
+						stateid);
+				goto wait_on_recovery;
+			}
+		case -NFS4ERR_OPENMODE:
 			if (inode) {
 				int err;
 
@@ -395,12 +432,6 @@ static int nfs4_do_handle_exception(struct nfs_server *server,
 			if (ret < 0)
 				break;
 			goto wait_on_recovery;
-		case -NFS4ERR_EXPIRED:
-			if (state != NULL) {
-				ret = nfs4_schedule_stateid_recovery(server, state);
-				if (ret < 0)
-					break;
-			}
 		case -NFS4ERR_STALE_STATEID:
 		case -NFS4ERR_STALE_CLIENTID:
 			nfs4_schedule_lease_recovery(clp);
@@ -616,6 +647,7 @@ int nfs40_setup_sequence(struct nfs4_slot_table *tbl,
 	}
 	spin_unlock(&tbl->slot_tbl_lock);
 
+	slot->privileged = args->sa_privileged ? 1 : 0;
 	args->sa_slot = slot;
 	res->sr_slot = slot;
 
@@ -723,12 +755,20 @@ static int nfs41_sequence_process(struct rpc_task *task,
 	/* Check the SEQUENCE operation status */
 	switch (res->sr_status) {
 	case 0:
+		/* If previous op on slot was interrupted and we reused
+		 * the seq# and got a reply from the cache, then retry
+		 */
+		if (task->tk_status == -EREMOTEIO && interrupted) {
+			++slot->seq_nr;
+			goto retry_nowait;
+		}
 		/* Update the slot's sequence and clientid lease timer */
 		slot->seq_done = 1;
 		clp = session->clp;
 		do_renew_lease(clp, res->sr_timestamp);
 		/* Check sequence flags */
-		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags);
+		nfs41_handle_sequence_flag_errors(clp, res->sr_status_flags,
+				!!slot->privileged);
 		nfs41_update_target_slotid(slot->table, slot, res);
 		break;
 	case 1:
@@ -875,6 +915,7 @@ int nfs41_setup_sequence(struct nfs4_session *session,
 	}
 	spin_unlock(&tbl->slot_tbl_lock);
 
+	slot->privileged = args->sa_privileged ? 1 : 0;
 	args->sa_slot = slot;
 
 	dprintk("<-- %s slotid=%u seqid=%u\n", __func__,
@@ -1353,6 +1394,19 @@ static void update_open_stateflags(struct nfs4_state *state, fmode_t fmode)
 	nfs4_state_set_mode_locked(state, state->state | fmode);
 }
 
+#ifdef CONFIG_NFS_V4_1
+static bool nfs_open_stateid_recover_openmode(struct nfs4_state *state)
+{
+	if (state->n_rdonly && !test_bit(NFS_O_RDONLY_STATE, &state->flags))
+		return true;
+	if (state->n_wronly && !test_bit(NFS_O_WRONLY_STATE, &state->flags))
+		return true;
+	if (state->n_rdwr && !test_bit(NFS_O_RDWR_STATE, &state->flags))
+		return true;
+	return false;
+}
+#endif /* CONFIG_NFS_V4_1 */
+
 static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
 {
 	struct nfs_client *clp = state->owner->so_server->nfs_client;
@@ -1369,11 +1423,12 @@ static void nfs_test_and_clear_all_open_stateid(struct nfs4_state *state)
 }
 
 static bool nfs_need_update_open_stateid(struct nfs4_state *state,
-		nfs4_stateid *stateid)
+		const nfs4_stateid *stateid, nfs4_stateid *freeme)
 {
 	if (test_and_set_bit(NFS_OPEN_STATE, &state->flags) == 0)
 		return true;
 	if (!nfs4_stateid_match_other(stateid, &state->open_stateid)) {
+		nfs4_stateid_copy(freeme, &state->open_stateid);
 		nfs_test_and_clear_all_open_stateid(state);
 		return true;
 	}
@@ -1437,7 +1492,9 @@ static void nfs_clear_open_stateid(struct nfs4_state *state,
 		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
 }
 
-static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *stateid, fmode_t fmode)
+static void nfs_set_open_stateid_locked(struct nfs4_state *state,
+		const nfs4_stateid *stateid, fmode_t fmode,
+		nfs4_stateid *freeme)
 {
 	switch (fmode) {
 		case FMODE_READ:
@@ -1449,14 +1506,18 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state, nfs4_stateid *
 		case FMODE_READ|FMODE_WRITE:
 			set_bit(NFS_O_RDWR_STATE, &state->flags);
 	}
-	if (!nfs_need_update_open_stateid(state, stateid))
+	if (!nfs_need_update_open_stateid(state, stateid, freeme))
 		return;
 	if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)
 		nfs4_stateid_copy(&state->stateid, stateid);
 	nfs4_stateid_copy(&state->open_stateid, stateid);
 }
 
-static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, const nfs4_stateid *deleg_stateid, fmode_t fmode)
+static void __update_open_stateid(struct nfs4_state *state,
+		const nfs4_stateid *open_stateid,
+		const nfs4_stateid *deleg_stateid,
+		fmode_t fmode,
+		nfs4_stateid *freeme)
 {
 	/*
 	 * Protect the call to nfs4_state_set_mode_locked and
@@ -1469,16 +1530,22 @@ static void __update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_s
 		set_bit(NFS_DELEGATED_STATE, &state->flags);
 	}
 	if (open_stateid != NULL)
-		nfs_set_open_stateid_locked(state, open_stateid, fmode);
+		nfs_set_open_stateid_locked(state, open_stateid, fmode, freeme);
 	write_sequnlock(&state->seqlock);
 	update_open_stateflags(state, fmode);
 	spin_unlock(&state->owner->so_lock);
 }
 
-static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stateid, nfs4_stateid *delegation, fmode_t fmode)
+static int update_open_stateid(struct nfs4_state *state,
+		const nfs4_stateid *open_stateid,
+		const nfs4_stateid *delegation,
+		fmode_t fmode)
 {
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs_client *clp = server->nfs_client;
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs_delegation *deleg_cur;
+	nfs4_stateid freeme = {0};
 	int ret = 0;
 
 	fmode &= (FMODE_READ|FMODE_WRITE);
@@ -1500,7 +1567,8 @@ static int update_open_stateid(struct nfs4_state *state, nfs4_stateid *open_stat
 		goto no_delegation_unlock;
 
 	nfs_mark_delegation_referenced(deleg_cur);
-	__update_open_stateid(state, open_stateid, &deleg_cur->stateid, fmode);
+	__update_open_stateid(state, open_stateid, &deleg_cur->stateid,
+			fmode, &freeme);
 	ret = 1;
 no_delegation_unlock:
 	spin_unlock(&deleg_cur->lock);
@@ -1508,11 +1576,14 @@ no_delegation:
 	rcu_read_unlock();
 
 	if (!ret && open_stateid != NULL) {
-		__update_open_stateid(state, open_stateid, NULL, fmode);
+		__update_open_stateid(state, open_stateid, NULL, fmode, &freeme);
 		ret = 1;
 	}
 	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags))
-		nfs4_schedule_state_manager(state->owner->so_server->nfs_client);
+		nfs4_schedule_state_manager(clp);
+	if (freeme.type != 0)
+		nfs4_test_and_free_stateid(server, &freeme,
+				state->owner->so_cred);
 
 	return ret;
 }
@@ -1889,7 +1960,6 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 		case -NFS4ERR_STALE_CLIENTID:
 		case -NFS4ERR_STALE_STATEID:
 			set_bit(NFS_DELEGATED_STATE, &state->flags);
-		case -NFS4ERR_EXPIRED:
 			/* Don't recall a delegation if it was lost */
 			nfs4_schedule_lease_recovery(server->nfs_client);
 			return -EAGAIN;
@@ -1901,6 +1971,7 @@ static int nfs4_handle_delegation_recall_error(struct nfs_server *server, struct
 			return -EAGAIN;
 		case -NFS4ERR_DELEG_REVOKED:
 		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_EXPIRED:
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_OPENMODE:
 			nfs_inode_find_state_and_recover(state->inode,
@@ -2382,9 +2453,10 @@ static int nfs4_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *sta
 	return ret;
 }
 
-static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
+static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
 {
-	nfs_remove_bad_delegation(state->inode);
+	nfs_remove_bad_delegation(state->inode, stateid);
 	write_seqlock(&state->seqlock);
 	nfs4_stateid_copy(&state->stateid, &state->open_stateid);
 	write_sequnlock(&state->seqlock);
@@ -2394,7 +2466,7 @@ static void nfs_finish_clear_delegation_stateid(struct nfs4_state *state)
 static void nfs40_clear_delegation_stateid(struct nfs4_state *state)
 {
 	if (rcu_access_pointer(NFS_I(state->inode)->delegation) != NULL)
-		nfs_finish_clear_delegation_stateid(state);
+		nfs_finish_clear_delegation_stateid(state, NULL);
 }
 
 static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2404,7 +2476,45 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
 	return nfs4_open_expired(sp, state);
 }
 
+static int nfs40_test_and_free_expired_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	return -NFS4ERR_BAD_STATEID;
+}
+
 #if defined(CONFIG_NFS_V4_1)
+static int nfs41_test_and_free_expired_stateid(struct nfs_server *server,
+		nfs4_stateid *stateid,
+		struct rpc_cred *cred)
+{
+	int status;
+
+	switch (stateid->type) {
+	default:
+		break;
+	case NFS4_INVALID_STATEID_TYPE:
+	case NFS4_SPECIAL_STATEID_TYPE:
+		return -NFS4ERR_BAD_STATEID;
+	case NFS4_REVOKED_STATEID_TYPE:
+		goto out_free;
+	}
+
+	status = nfs41_test_stateid(server, stateid, cred);
+	switch (status) {
+	case -NFS4ERR_EXPIRED:
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_DELEG_REVOKED:
+		break;
+	default:
+		return status;
+	}
+out_free:
+	/* Ack the revoked state to the server */
+	nfs41_free_stateid(server, stateid, cred, true);
+	return -NFS4ERR_EXPIRED;
+}
+
 static void nfs41_check_delegation_stateid(struct nfs4_state *state)
 {
 	struct nfs_server *server = NFS_SERVER(state->inode);
@@ -2422,23 +2532,68 @@ static void nfs41_check_delegation_stateid(struct nfs4_state *state)
 	}
 
 	nfs4_stateid_copy(&stateid, &delegation->stateid);
+	if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+		rcu_read_unlock();
+		nfs_finish_clear_delegation_stateid(state, &stateid);
+		return;
+	}
+
+	if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags)) {
+		rcu_read_unlock();
+		return;
+	}
+
 	cred = get_rpccred(delegation->cred);
 	rcu_read_unlock();
-	status = nfs41_test_stateid(server, &stateid, cred);
+	status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
 	trace_nfs4_test_delegation_stateid(state, NULL, status);
-
-	if (status != NFS_OK) {
-		/* Free the stateid unless the server explicitly
-		 * informs us the stateid is unrecognized. */
-		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, &stateid, cred);
-		nfs_finish_clear_delegation_stateid(state);
-	}
+	if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID)
+		nfs_finish_clear_delegation_stateid(state, &stateid);
 
 	put_rpccred(cred);
 }
 
 /**
+ * nfs41_check_expired_locks - possibly free a lock stateid
+ *
+ * @state: NFSv4 state for an inode
+ *
+ * Returns NFS_OK if recovery for this stateid is now finished.
+ * Otherwise a negative NFS4ERR value is returned.
+ */
+static int nfs41_check_expired_locks(struct nfs4_state *state)
+{
+	int status, ret = NFS_OK;
+	struct nfs4_lock_state *lsp;
+	struct nfs_server *server = NFS_SERVER(state->inode);
+
+	if (!test_bit(LK_STATE_IN_USE, &state->flags))
+		goto out;
+	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
+		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
+			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
+
+			status = nfs41_test_and_free_expired_stateid(server,
+					&lsp->ls_stateid,
+					cred);
+			trace_nfs4_test_lock_stateid(state, lsp, status);
+			if (status == -NFS4ERR_EXPIRED ||
+			    status == -NFS4ERR_BAD_STATEID) {
+				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
+				lsp->ls_stateid.type = NFS4_INVALID_STATEID_TYPE;
+				if (!recover_lost_locks)
+					set_bit(NFS_LOCK_LOST, &lsp->ls_flags);
+			} else if (status != NFS_OK) {
+				ret = status;
+				break;
+			}
+		}
+	};
+out:
+	return ret;
+}
+
+/**
  * nfs41_check_open_stateid - possibly free an open stateid
  *
  * @state: NFSv4 state for an inode
@@ -2453,26 +2608,28 @@ static int nfs41_check_open_stateid(struct nfs4_state *state)
 	struct rpc_cred *cred = state->owner->so_cred;
 	int status;
 
-	/* If a state reset has been done, test_stateid is unneeded */
-	if ((test_bit(NFS_O_RDONLY_STATE, &state->flags) == 0) &&
-	    (test_bit(NFS_O_WRONLY_STATE, &state->flags) == 0) &&
-	    (test_bit(NFS_O_RDWR_STATE, &state->flags) == 0))
+	if (test_bit(NFS_OPEN_STATE, &state->flags) == 0) {
+		if (test_bit(NFS_DELEGATED_STATE, &state->flags) == 0)  {
+			if (nfs4_have_delegation(state->inode, state->state))
+				return NFS_OK;
+			return -NFS4ERR_OPENMODE;
+		}
 		return -NFS4ERR_BAD_STATEID;
-
-	status = nfs41_test_stateid(server, stateid, cred);
+	}
+	status = nfs41_test_and_free_expired_stateid(server, stateid, cred);
 	trace_nfs4_test_open_stateid(state, NULL, status);
-	if (status != NFS_OK) {
-		/* Free the stateid unless the server explicitly
-		 * informs us the stateid is unrecognized. */
-		if (status != -NFS4ERR_BAD_STATEID)
-			nfs41_free_stateid(server, stateid, cred);
-
+	if (status == -NFS4ERR_EXPIRED || status == -NFS4ERR_BAD_STATEID) {
 		clear_bit(NFS_O_RDONLY_STATE, &state->flags);
 		clear_bit(NFS_O_WRONLY_STATE, &state->flags);
 		clear_bit(NFS_O_RDWR_STATE, &state->flags);
 		clear_bit(NFS_OPEN_STATE, &state->flags);
+		stateid->type = NFS4_INVALID_STATEID_TYPE;
 	}
-	return status;
+	if (status != NFS_OK)
+		return status;
+	if (nfs_open_stateid_recover_openmode(state))
+		return -NFS4ERR_OPENMODE;
+	return NFS_OK;
 }
 
 static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *state)
@@ -2480,6 +2637,9 @@ static int nfs41_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st
 	int status;
 
 	nfs41_check_delegation_stateid(state);
+	status = nfs41_check_expired_locks(state);
+	if (status != NFS_OK)
+		return status;
 	status = nfs41_check_open_stateid(state);
 	if (status != NFS_OK)
 		status = nfs4_open_expired(sp, state);
@@ -2537,6 +2697,8 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 		goto out;
 	if (server->caps & NFS_CAP_POSIX_LOCK)
 		set_bit(NFS_STATE_POSIX_LOCKS, &state->flags);
+	if (opendata->o_res.rflags & NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK)
+		set_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags);
 
 	dentry = opendata->dentry;
 	if (d_really_is_negative(dentry)) {
@@ -2899,9 +3061,12 @@ static void nfs4_close_done(struct rpc_task *task, void *data)
 			break;
 		case -NFS4ERR_ADMIN_REVOKED:
 		case -NFS4ERR_STALE_STATEID:
+		case -NFS4ERR_EXPIRED:
+			nfs4_free_revoked_stateid(server,
+					&calldata->arg.stateid,
+					task->tk_msg.rpc_cred);
 		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_BAD_STATEID:
-		case -NFS4ERR_EXPIRED:
 			if (!nfs4_stateid_match(&calldata->arg.stateid,
 						&state->open_stateid)) {
 				rpc_restart_call_prepare(task);
@@ -4312,7 +4477,7 @@ static int nfs4_proc_fsinfo(struct nfs_server *server, struct nfs_fh *fhandle, s
 	if (error == 0) {
 		/* block layout checks this! */
 		server->pnfs_blksize = fsinfo->blksize;
-		set_pnfs_layoutdriver(server, fhandle, fsinfo->layouttype);
+		set_pnfs_layoutdriver(server, fhandle, fsinfo);
 	}
 
 	return error;
@@ -4399,24 +4564,25 @@ static bool nfs4_error_stateid_expired(int err)
 	return false;
 }
 
-void __nfs4_read_done_cb(struct nfs_pgio_header *hdr)
-{
-	nfs_invalidate_atime(hdr->inode);
-}
-
 static int nfs4_read_done_cb(struct rpc_task *task, struct nfs_pgio_header *hdr)
 {
 	struct nfs_server *server = NFS_SERVER(hdr->inode);
 
 	trace_nfs4_read(hdr, task->tk_status);
-	if (nfs4_async_handle_error(task, server,
-				    hdr->args.context->state,
-				    NULL) == -EAGAIN) {
-		rpc_restart_call_prepare(task);
-		return -EAGAIN;
+	if (task->tk_status < 0) {
+		struct nfs4_exception exception = {
+			.inode = hdr->inode,
+			.state = hdr->args.context->state,
+			.stateid = &hdr->args.stateid,
+		};
+		task->tk_status = nfs4_async_handle_exception(task,
+				server, task->tk_status, &exception);
+		if (exception.retry) {
+			rpc_restart_call_prepare(task);
+			return -EAGAIN;
+		}
 	}
 
-	__nfs4_read_done_cb(hdr);
 	if (task->tk_status > 0)
 		renew_lease(server, hdr->timestamp);
 	return 0;
@@ -4445,6 +4611,8 @@ static int nfs4_read_done(struct rpc_task *task, struct nfs_pgio_header *hdr)
 		return -EAGAIN;
 	if (nfs4_read_stateid_changed(task, &hdr->args))
 		return -EAGAIN;
+	if (task->tk_status > 0)
+		nfs_invalidate_atime(hdr->inode);
 	return hdr->pgio_done_cb ? hdr->pgio_done_cb(task, hdr) :
 				    nfs4_read_done_cb(task, hdr);
 }
@@ -4482,11 +4650,19 @@ static int nfs4_write_done_cb(struct rpc_task *task,
 	struct inode *inode = hdr->inode;
 
 	trace_nfs4_write(hdr, task->tk_status);
-	if (nfs4_async_handle_error(task, NFS_SERVER(inode),
-				    hdr->args.context->state,
-				    NULL) == -EAGAIN) {
-		rpc_restart_call_prepare(task);
-		return -EAGAIN;
+	if (task->tk_status < 0) {
+		struct nfs4_exception exception = {
+			.inode = hdr->inode,
+			.state = hdr->args.context->state,
+			.stateid = &hdr->args.stateid,
+		};
+		task->tk_status = nfs4_async_handle_exception(task,
+				NFS_SERVER(inode), task->tk_status,
+				&exception);
+		if (exception.retry) {
+			rpc_restart_call_prepare(task);
+			return -EAGAIN;
+		}
 	}
 	if (task->tk_status >= 0) {
 		renew_lease(NFS_SERVER(inode), hdr->timestamp);
@@ -5123,12 +5299,14 @@ static void nfs4_init_boot_verifier(const struct nfs_client *clp,
 	if (test_bit(NFS4CLNT_PURGE_STATE, &clp->cl_state)) {
 		/* An impossible timestamp guarantees this value
 		 * will never match a generated boot time. */
-		verf[0] = 0;
-		verf[1] = cpu_to_be32(NSEC_PER_SEC + 1);
+		verf[0] = cpu_to_be32(U32_MAX);
+		verf[1] = cpu_to_be32(U32_MAX);
 	} else {
 		struct nfs_net *nn = net_generic(clp->cl_net, nfs_net_id);
-		verf[0] = cpu_to_be32(nn->boot_time.tv_sec);
-		verf[1] = cpu_to_be32(nn->boot_time.tv_nsec);
+		u64 ns = ktime_to_ns(nn->boot_time);
+
+		verf[0] = cpu_to_be32(ns >> 32);
+		verf[1] = cpu_to_be32(ns);
 	}
 	memcpy(bootverf->data, verf, sizeof(bootverf->data));
 }
@@ -5393,10 +5571,13 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
 		renew_lease(data->res.server, data->timestamp);
 	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_EXPIRED:
+		nfs4_free_revoked_stateid(data->res.server,
+				data->args.stateid,
+				task->tk_msg.rpc_cred);
 	case -NFS4ERR_BAD_STATEID:
 	case -NFS4ERR_OLD_STATEID:
 	case -NFS4ERR_STALE_STATEID:
-	case -NFS4ERR_EXPIRED:
 		task->tk_status = 0;
 		if (data->roc)
 			pnfs_roc_set_barrier(data->inode, data->roc_barrier);
@@ -5528,22 +5709,6 @@ int nfs4_proc_delegreturn(struct inode *inode, struct rpc_cred *cred, const nfs4
 	return err;
 }
 
-#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
-#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
-
-/* 
- * sleep, with exponential backoff, and retry the LOCK operation. 
- */
-static unsigned long
-nfs4_set_lock_task_retry(unsigned long timeout)
-{
-	freezable_schedule_timeout_killable_unsafe(timeout);
-	timeout <<= 1;
-	if (timeout > NFS4_LOCK_MAXTIMEOUT)
-		return NFS4_LOCK_MAXTIMEOUT;
-	return timeout;
-}
-
 static int _nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *request)
 {
 	struct inode *inode = state->inode;
@@ -5600,11 +5765,6 @@ static int nfs4_proc_getlk(struct nfs4_state *state, int cmd, struct file_lock *
 	return err;
 }
 
-static int do_vfs_lock(struct inode *inode, struct file_lock *fl)
-{
-	return locks_lock_inode_wait(inode, fl);
-}
-
 struct nfs4_unlockdata {
 	struct nfs_locku_args arg;
 	struct nfs_locku_res res;
@@ -5657,14 +5817,18 @@ static void nfs4_locku_done(struct rpc_task *task, void *data)
 	switch (task->tk_status) {
 		case 0:
 			renew_lease(calldata->server, calldata->timestamp);
-			do_vfs_lock(calldata->lsp->ls_state->inode, &calldata->fl);
+			locks_lock_inode_wait(calldata->lsp->ls_state->inode, &calldata->fl);
 			if (nfs4_update_lock_stateid(calldata->lsp,
 					&calldata->res.stateid))
 				break;
+		case -NFS4ERR_ADMIN_REVOKED:
+		case -NFS4ERR_EXPIRED:
+			nfs4_free_revoked_stateid(calldata->server,
+					&calldata->arg.stateid,
+					task->tk_msg.rpc_cred);
 		case -NFS4ERR_BAD_STATEID:
 		case -NFS4ERR_OLD_STATEID:
 		case -NFS4ERR_STALE_STATEID:
-		case -NFS4ERR_EXPIRED:
 			if (!nfs4_stateid_match(&calldata->arg.stateid,
 						&calldata->lsp->ls_stateid))
 				rpc_restart_call_prepare(task);
@@ -5765,7 +5929,7 @@ static int nfs4_proc_unlck(struct nfs4_state *state, int cmd, struct file_lock *
 	mutex_lock(&sp->so_delegreturn_mutex);
 	/* Exclude nfs4_reclaim_open_stateid() - note nesting! */
 	down_read(&nfsi->rwsem);
-	if (do_vfs_lock(inode, request) == -ENOENT) {
+	if (locks_lock_inode_wait(inode, request) == -ENOENT) {
 		up_read(&nfsi->rwsem);
 		mutex_unlock(&sp->so_delegreturn_mutex);
 		goto out;
@@ -5906,7 +6070,7 @@ static void nfs4_lock_done(struct rpc_task *task, void *calldata)
 				data->timestamp);
 		if (data->arg.new_lock) {
 			data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
-			if (do_vfs_lock(lsp->ls_state->inode, &data->fl) < 0) {
+			if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0) {
 				rpc_restart_call_prepare(task);
 				break;
 			}
@@ -5965,6 +6129,7 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
 {
 	switch (error) {
 	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
 		if (new_lock_owner != 0 ||
@@ -5973,7 +6138,6 @@ static void nfs4_handle_setlk_error(struct nfs_server *server, struct nfs4_lock_
 		break;
 	case -NFS4ERR_STALE_STATEID:
 		lsp->ls_seqid.flags &= ~NFS_SEQID_CONFIRMED;
-	case -NFS4ERR_EXPIRED:
 		nfs4_schedule_lease_recovery(server->nfs_client);
 	};
 }
@@ -6083,52 +6247,19 @@ out:
 }
 
 #if defined(CONFIG_NFS_V4_1)
-/**
- * nfs41_check_expired_locks - possibly free a lock stateid
- *
- * @state: NFSv4 state for an inode
- *
- * Returns NFS_OK if recovery for this stateid is now finished.
- * Otherwise a negative NFS4ERR value is returned.
- */
-static int nfs41_check_expired_locks(struct nfs4_state *state)
-{
-	int status, ret = -NFS4ERR_BAD_STATEID;
-	struct nfs4_lock_state *lsp;
-	struct nfs_server *server = NFS_SERVER(state->inode);
-
-	list_for_each_entry(lsp, &state->lock_states, ls_locks) {
-		if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags)) {
-			struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
-
-			status = nfs41_test_stateid(server,
-					&lsp->ls_stateid,
-					cred);
-			trace_nfs4_test_lock_stateid(state, lsp, status);
-			if (status != NFS_OK) {
-				/* Free the stateid unless the server
-				 * informs us the stateid is unrecognized. */
-				if (status != -NFS4ERR_BAD_STATEID)
-					nfs41_free_stateid(server,
-							&lsp->ls_stateid,
-							cred);
-				clear_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags);
-				ret = status;
-			}
-		}
-	};
-
-	return ret;
-}
-
 static int nfs41_lock_expired(struct nfs4_state *state, struct file_lock *request)
 {
-	int status = NFS_OK;
+	struct nfs4_lock_state *lsp;
+	int status;
 
-	if (test_bit(LK_STATE_IN_USE, &state->flags))
-		status = nfs41_check_expired_locks(state);
-	if (status != NFS_OK)
-		status = nfs4_lock_expired(state, request);
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		return status;
+	lsp = request->fl_u.nfs4_fl.owner;
+	if (test_bit(NFS_LOCK_INITIALIZED, &lsp->ls_flags) ||
+	    test_bit(NFS_LOCK_LOST, &lsp->ls_flags))
+		return 0;
+	status = nfs4_lock_expired(state, request);
 	return status;
 }
 #endif
@@ -6138,17 +6269,10 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 	struct nfs_inode *nfsi = NFS_I(state->inode);
 	struct nfs4_state_owner *sp = state->owner;
 	unsigned char fl_flags = request->fl_flags;
-	int status = -ENOLCK;
+	int status;
 
-	if ((fl_flags & FL_POSIX) &&
-			!test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
-		goto out;
-	/* Is this a delegated open? */
-	status = nfs4_set_lock_state(state, request);
-	if (status != 0)
-		goto out;
 	request->fl_flags |= FL_ACCESS;
-	status = do_vfs_lock(state->inode, request);
+	status = locks_lock_inode_wait(state->inode, request);
 	if (status < 0)
 		goto out;
 	mutex_lock(&sp->so_delegreturn_mutex);
@@ -6157,7 +6281,7 @@ static int _nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock
 		/* Yes: cache locks! */
 		/* ...but avoid races with delegation recall... */
 		request->fl_flags = fl_flags & ~FL_SLEEP;
-		status = do_vfs_lock(state->inode, request);
+		status = locks_lock_inode_wait(state->inode, request);
 		up_read(&nfsi->rwsem);
 		mutex_unlock(&sp->so_delegreturn_mutex);
 		goto out;
@@ -6188,12 +6312,124 @@ static int nfs4_proc_setlk(struct nfs4_state *state, int cmd, struct file_lock *
 	return err;
 }
 
+#define NFS4_LOCK_MINTIMEOUT (1 * HZ)
+#define NFS4_LOCK_MAXTIMEOUT (30 * HZ)
+
+static int
+nfs4_retry_setlk_simple(struct nfs4_state *state, int cmd,
+			struct file_lock *request)
+{
+	int		status = -ERESTARTSYS;
+	unsigned long	timeout = NFS4_LOCK_MINTIMEOUT;
+
+	while(!signalled()) {
+		status = nfs4_proc_setlk(state, cmd, request);
+		if ((status != -EAGAIN) || IS_SETLK(cmd))
+			break;
+		freezable_schedule_timeout_interruptible(timeout);
+		timeout *= 2;
+		timeout = min_t(unsigned long, NFS4_LOCK_MAXTIMEOUT, timeout);
+		status = -ERESTARTSYS;
+	}
+	return status;
+}
+
+#ifdef CONFIG_NFS_V4_1
+struct nfs4_lock_waiter {
+	struct task_struct	*task;
+	struct inode		*inode;
+	struct nfs_lowner	*owner;
+	bool			notified;
+};
+
+static int
+nfs4_wake_lock_waiter(wait_queue_t *wait, unsigned int mode, int flags, void *key)
+{
+	int ret;
+	struct cb_notify_lock_args *cbnl = key;
+	struct nfs4_lock_waiter	*waiter	= wait->private;
+	struct nfs_lowner	*lowner = &cbnl->cbnl_owner,
+				*wowner = waiter->owner;
+
+	/* Only wake if the callback was for the same owner */
+	if (lowner->clientid != wowner->clientid ||
+	    lowner->id != wowner->id		 ||
+	    lowner->s_dev != wowner->s_dev)
+		return 0;
+
+	/* Make sure it's for the right inode */
+	if (nfs_compare_fh(NFS_FH(waiter->inode), &cbnl->cbnl_fh))
+		return 0;
+
+	waiter->notified = true;
+
+	/* override "private" so we can use default_wake_function */
+	wait->private = waiter->task;
+	ret = autoremove_wake_function(wait, mode, flags, key);
+	wait->private = waiter;
+	return ret;
+}
+
+static int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	int status = -ERESTARTSYS;
+	unsigned long flags;
+	struct nfs4_lock_state *lsp = request->fl_u.nfs4_fl.owner;
+	struct nfs_server *server = NFS_SERVER(state->inode);
+	struct nfs_client *clp = server->nfs_client;
+	wait_queue_head_t *q = &clp->cl_lock_waitq;
+	struct nfs_lowner owner = { .clientid = clp->cl_clientid,
+				    .id = lsp->ls_seqid.owner_id,
+				    .s_dev = server->s_dev };
+	struct nfs4_lock_waiter waiter = { .task  = current,
+					   .inode = state->inode,
+					   .owner = &owner,
+					   .notified = false };
+	wait_queue_t wait;
+
+	/* Don't bother with waitqueue if we don't expect a callback */
+	if (!test_bit(NFS_STATE_MAY_NOTIFY_LOCK, &state->flags))
+		return nfs4_retry_setlk_simple(state, cmd, request);
+
+	init_wait(&wait);
+	wait.private = &waiter;
+	wait.func = nfs4_wake_lock_waiter;
+	add_wait_queue(q, &wait);
+
+	while(!signalled()) {
+		status = nfs4_proc_setlk(state, cmd, request);
+		if ((status != -EAGAIN) || IS_SETLK(cmd))
+			break;
+
+		status = -ERESTARTSYS;
+		spin_lock_irqsave(&q->lock, flags);
+		if (waiter.notified) {
+			spin_unlock_irqrestore(&q->lock, flags);
+			continue;
+		}
+		set_current_state(TASK_INTERRUPTIBLE);
+		spin_unlock_irqrestore(&q->lock, flags);
+
+		freezable_schedule_timeout_interruptible(NFS4_LOCK_MAXTIMEOUT);
+	}
+
+	finish_wait(q, &wait);
+	return status;
+}
+#else /* !CONFIG_NFS_V4_1 */
+static inline int
+nfs4_retry_setlk(struct nfs4_state *state, int cmd, struct file_lock *request)
+{
+	return nfs4_retry_setlk_simple(state, cmd, request);
+}
+#endif
+
 static int
 nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 {
 	struct nfs_open_context *ctx;
 	struct nfs4_state *state;
-	unsigned long timeout = NFS4_LOCK_MINTIMEOUT;
 	int status;
 
 	/* verify open state */
@@ -6220,6 +6456,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 
 	if (state == NULL)
 		return -ENOLCK;
+
+	if ((request->fl_flags & FL_POSIX) &&
+	    !test_bit(NFS_STATE_POSIX_LOCKS, &state->flags))
+		return -ENOLCK;
+
 	/*
 	 * Don't rely on the VFS having checked the file open mode,
 	 * since it won't do this for flock() locks.
@@ -6234,16 +6475,11 @@ nfs4_proc_lock(struct file *filp, int cmd, struct file_lock *request)
 			return -EBADF;
 	}
 
-	do {
-		status = nfs4_proc_setlk(state, cmd, request);
-		if ((status != -EAGAIN) || IS_SETLK(cmd))
-			break;
-		timeout = nfs4_set_lock_task_retry(timeout);
-		status = -ERESTARTSYS;
-		if (signalled())
-			break;
-	} while(status < 0);
-	return status;
+	status = nfs4_set_lock_state(state, request);
+	if (status != 0)
+		return status;
+
+	return nfs4_retry_setlk(state, cmd, request);
 }
 
 int nfs4_lock_delegation_recall(struct file_lock *fl, struct nfs4_state *state, const nfs4_stateid *stateid)
@@ -7104,75 +7340,161 @@ static int nfs4_sp4_select_mode(struct nfs_client *clp,
 	return 0;
 }
 
+struct nfs41_exchange_id_data {
+	struct nfs41_exchange_id_res res;
+	struct nfs41_exchange_id_args args;
+	struct rpc_xprt *xprt;
+	int rpc_status;
+};
+
+static void nfs4_exchange_id_done(struct rpc_task *task, void *data)
+{
+	struct nfs41_exchange_id_data *cdata =
+					(struct nfs41_exchange_id_data *)data;
+	struct nfs_client *clp = cdata->args.client;
+	int status = task->tk_status;
+
+	trace_nfs4_exchange_id(clp, status);
+
+	if (status == 0)
+		status = nfs4_check_cl_exchange_flags(cdata->res.flags);
+
+	if (cdata->xprt && status == 0) {
+		status = nfs4_detect_session_trunking(clp, &cdata->res,
+						      cdata->xprt);
+		goto out;
+	}
+
+	if (status  == 0)
+		status = nfs4_sp4_select_mode(clp, &cdata->res.state_protect);
+
+	if (status == 0) {
+		clp->cl_clientid = cdata->res.clientid;
+		clp->cl_exchange_flags = cdata->res.flags;
+		/* Client ID is not confirmed */
+		if (!(cdata->res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
+			clear_bit(NFS4_SESSION_ESTABLISHED,
+			&clp->cl_session->session_state);
+			clp->cl_seqid = cdata->res.seqid;
+		}
+
+		kfree(clp->cl_serverowner);
+		clp->cl_serverowner = cdata->res.server_owner;
+		cdata->res.server_owner = NULL;
+
+		/* use the most recent implementation id */
+		kfree(clp->cl_implid);
+		clp->cl_implid = cdata->res.impl_id;
+		cdata->res.impl_id = NULL;
+
+		if (clp->cl_serverscope != NULL &&
+		    !nfs41_same_server_scope(clp->cl_serverscope,
+					cdata->res.server_scope)) {
+			dprintk("%s: server_scope mismatch detected\n",
+				__func__);
+			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
+			kfree(clp->cl_serverscope);
+			clp->cl_serverscope = NULL;
+		}
+
+		if (clp->cl_serverscope == NULL) {
+			clp->cl_serverscope = cdata->res.server_scope;
+			cdata->res.server_scope = NULL;
+		}
+		/* Save the EXCHANGE_ID verifier session trunk tests */
+		memcpy(clp->cl_confirm.data, cdata->args.verifier->data,
+		       sizeof(clp->cl_confirm.data));
+	}
+out:
+	cdata->rpc_status = status;
+	return;
+}
+
+static void nfs4_exchange_id_release(void *data)
+{
+	struct nfs41_exchange_id_data *cdata =
+					(struct nfs41_exchange_id_data *)data;
+
+	nfs_put_client(cdata->args.client);
+	if (cdata->xprt) {
+		xprt_put(cdata->xprt);
+		rpc_clnt_xprt_switch_put(cdata->args.client->cl_rpcclient);
+	}
+	kfree(cdata->res.impl_id);
+	kfree(cdata->res.server_scope);
+	kfree(cdata->res.server_owner);
+	kfree(cdata);
+}
+
+static const struct rpc_call_ops nfs4_exchange_id_call_ops = {
+	.rpc_call_done = nfs4_exchange_id_done,
+	.rpc_release = nfs4_exchange_id_release,
+};
+
 /*
  * _nfs4_proc_exchange_id()
  *
  * Wrapper for EXCHANGE_ID operation.
  */
 static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
-	u32 sp4_how)
+			u32 sp4_how, struct rpc_xprt *xprt)
 {
 	nfs4_verifier verifier;
-	struct nfs41_exchange_id_args args = {
-		.verifier = &verifier,
-		.client = clp,
-#ifdef CONFIG_NFS_V4_1_MIGRATION
-		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-			 EXCHGID4_FLAG_BIND_PRINC_STATEID |
-			 EXCHGID4_FLAG_SUPP_MOVED_MIGR,
-#else
-		.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
-			 EXCHGID4_FLAG_BIND_PRINC_STATEID,
-#endif
-	};
-	struct nfs41_exchange_id_res res = {
-		0
-	};
-	int status;
 	struct rpc_message msg = {
 		.rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_EXCHANGE_ID],
-		.rpc_argp = &args,
-		.rpc_resp = &res,
 		.rpc_cred = cred,
 	};
+	struct rpc_task_setup task_setup_data = {
+		.rpc_client = clp->cl_rpcclient,
+		.callback_ops = &nfs4_exchange_id_call_ops,
+		.rpc_message = &msg,
+		.flags = RPC_TASK_ASYNC | RPC_TASK_TIMEOUT,
+	};
+	struct nfs41_exchange_id_data *calldata;
+	struct rpc_task *task;
+	int status = -EIO;
+
+	if (!atomic_inc_not_zero(&clp->cl_count))
+		goto out;
+
+	status = -ENOMEM;
+	calldata = kzalloc(sizeof(*calldata), GFP_NOFS);
+	if (!calldata)
+		goto out;
 
-	nfs4_init_boot_verifier(clp, &verifier);
+	if (!xprt)
+		nfs4_init_boot_verifier(clp, &verifier);
 
 	status = nfs4_init_uniform_client_string(clp);
 	if (status)
-		goto out;
+		goto out_calldata;
 
 	dprintk("NFS call  exchange_id auth=%s, '%s'\n",
 		clp->cl_rpcclient->cl_auth->au_ops->au_name,
 		clp->cl_owner_id);
 
-	res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
-					GFP_NOFS);
-	if (unlikely(res.server_owner == NULL)) {
-		status = -ENOMEM;
-		goto out;
-	}
+	calldata->res.server_owner = kzalloc(sizeof(struct nfs41_server_owner),
+						GFP_NOFS);
+	status = -ENOMEM;
+	if (unlikely(calldata->res.server_owner == NULL))
+		goto out_calldata;
 
-	res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
+	calldata->res.server_scope = kzalloc(sizeof(struct nfs41_server_scope),
 					GFP_NOFS);
-	if (unlikely(res.server_scope == NULL)) {
-		status = -ENOMEM;
+	if (unlikely(calldata->res.server_scope == NULL))
 		goto out_server_owner;
-	}
 
-	res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
-	if (unlikely(res.impl_id == NULL)) {
-		status = -ENOMEM;
+	calldata->res.impl_id = kzalloc(sizeof(struct nfs41_impl_id), GFP_NOFS);
+	if (unlikely(calldata->res.impl_id == NULL))
 		goto out_server_scope;
-	}
 
 	switch (sp4_how) {
 	case SP4_NONE:
-		args.state_protect.how = SP4_NONE;
+		calldata->args.state_protect.how = SP4_NONE;
 		break;
 
 	case SP4_MACH_CRED:
-		args.state_protect = nfs4_sp4_mach_cred_request;
+		calldata->args.state_protect = nfs4_sp4_mach_cred_request;
 		break;
 
 	default:
@@ -7181,56 +7503,42 @@ static int _nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred,
 		status = -EINVAL;
 		goto out_impl_id;
 	}
+	if (xprt) {
+		calldata->xprt = xprt;
+		task_setup_data.rpc_xprt = xprt;
+		task_setup_data.flags =
+				RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC;
+		calldata->args.verifier = &clp->cl_confirm;
+	} else {
+		calldata->args.verifier = &verifier;
+	}
+	calldata->args.client = clp;
+#ifdef CONFIG_NFS_V4_1_MIGRATION
+	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+	EXCHGID4_FLAG_BIND_PRINC_STATEID |
+	EXCHGID4_FLAG_SUPP_MOVED_MIGR,
+#else
+	calldata->args.flags = EXCHGID4_FLAG_SUPP_MOVED_REFER |
+	EXCHGID4_FLAG_BIND_PRINC_STATEID,
+#endif
+	msg.rpc_argp = &calldata->args;
+	msg.rpc_resp = &calldata->res;
+	task_setup_data.callback_data = calldata;
 
-	status = rpc_call_sync(clp->cl_rpcclient, &msg, RPC_TASK_TIMEOUT);
-	trace_nfs4_exchange_id(clp, status);
-	if (status == 0)
-		status = nfs4_check_cl_exchange_flags(res.flags);
-
-	if (status == 0)
-		status = nfs4_sp4_select_mode(clp, &res.state_protect);
-
-	if (status == 0) {
-		clp->cl_clientid = res.clientid;
-		clp->cl_exchange_flags = res.flags;
-		/* Client ID is not confirmed */
-		if (!(res.flags & EXCHGID4_FLAG_CONFIRMED_R)) {
-			clear_bit(NFS4_SESSION_ESTABLISHED,
-					&clp->cl_session->session_state);
-			clp->cl_seqid = res.seqid;
-		}
-
-		kfree(clp->cl_serverowner);
-		clp->cl_serverowner = res.server_owner;
-		res.server_owner = NULL;
-
-		/* use the most recent implementation id */
-		kfree(clp->cl_implid);
-		clp->cl_implid = res.impl_id;
-		res.impl_id = NULL;
-
-		if (clp->cl_serverscope != NULL &&
-		    !nfs41_same_server_scope(clp->cl_serverscope,
-					     res.server_scope)) {
-			dprintk("%s: server_scope mismatch detected\n",
-				__func__);
-			set_bit(NFS4CLNT_SERVER_SCOPE_MISMATCH, &clp->cl_state);
-			kfree(clp->cl_serverscope);
-			clp->cl_serverscope = NULL;
-		}
-
-		if (clp->cl_serverscope == NULL) {
-			clp->cl_serverscope = res.server_scope;
-			res.server_scope = NULL;
-		}
+	task = rpc_run_task(&task_setup_data);
+	if (IS_ERR(task)) {
+	status = PTR_ERR(task);
+		goto out_impl_id;
 	}
 
-out_impl_id:
-	kfree(res.impl_id);
-out_server_scope:
-	kfree(res.server_scope);
-out_server_owner:
-	kfree(res.server_owner);
+	if (!xprt) {
+		status = rpc_wait_for_completion_task(task);
+		if (!status)
+			status = calldata->rpc_status;
+	} else	/* session trunking test */
+		status = calldata->rpc_status;
+
+	rpc_put_task(task);
 out:
 	if (clp->cl_implid != NULL)
 		dprintk("NFS reply exchange_id: Server Implementation ID: "
@@ -7240,6 +7548,16 @@ out:
 			clp->cl_implid->date.nseconds);
 	dprintk("NFS reply exchange_id: %d\n", status);
 	return status;
+
+out_impl_id:
+	kfree(calldata->res.impl_id);
+out_server_scope:
+	kfree(calldata->res.server_scope);
+out_server_owner:
+	kfree(calldata->res.server_owner);
+out_calldata:
+	kfree(calldata);
+	goto out;
 }
 
 /*
@@ -7262,14 +7580,45 @@ int nfs4_proc_exchange_id(struct nfs_client *clp, struct rpc_cred *cred)
 	/* try SP4_MACH_CRED if krb5i/p	*/
 	if (authflavor == RPC_AUTH_GSS_KRB5I ||
 	    authflavor == RPC_AUTH_GSS_KRB5P) {
-		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED);
+		status = _nfs4_proc_exchange_id(clp, cred, SP4_MACH_CRED, NULL);
 		if (!status)
 			return 0;
 	}
 
 	/* try SP4_NONE */
-	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE);
+	return _nfs4_proc_exchange_id(clp, cred, SP4_NONE, NULL);
+}
+
+/**
+ * nfs4_test_session_trunk
+ *
+ * This is an add_xprt_test() test function called from
+ * rpc_clnt_setup_test_and_add_xprt.
+ *
+ * The rpc_xprt_switch is referrenced by rpc_clnt_setup_test_and_add_xprt
+ * and is dereferrenced in nfs4_exchange_id_release
+ *
+ * Upon success, add the new transport to the rpc_clnt
+ *
+ * @clnt: struct rpc_clnt to get new transport
+ * @xprt: the rpc_xprt to test
+ * @data: call data for _nfs4_proc_exchange_id.
+ */
+int nfs4_test_session_trunk(struct rpc_clnt *clnt, struct rpc_xprt *xprt,
+			    void *data)
+{
+	struct nfs4_add_xprt_data *adata = (struct nfs4_add_xprt_data *)data;
+	u32 sp4_how;
+
+	dprintk("--> %s try %s\n", __func__,
+		xprt->address_strings[RPC_DISPLAY_ADDR]);
+
+	sp4_how = (adata->clp->cl_sp4_flags == 0 ? SP4_NONE : SP4_MACH_CRED);
+
+	/* Test connection for session trunking. Async exchange_id call */
+	return  _nfs4_proc_exchange_id(adata->clp, adata->cred, sp4_how, xprt);
 }
+EXPORT_SYMBOL_GPL(nfs4_test_session_trunk);
 
 static int _nfs4_proc_destroy_clientid(struct nfs_client *clp,
 		struct rpc_cred *cred)
@@ -7463,7 +7812,7 @@ static void nfs4_init_channel_attrs(struct nfs41_create_session_args *args,
 	args->bc_attrs.max_resp_sz = max_bc_payload;
 	args->bc_attrs.max_resp_sz_cached = 0;
 	args->bc_attrs.max_ops = NFS4_MAX_BACK_CHANNEL_OPS;
-	args->bc_attrs.max_reqs = NFS41_BC_MAX_CALLBACKS;
+	args->bc_attrs.max_reqs = min_t(unsigned short, max_session_cb_slots, 1);
 
 	dprintk("%s: Back Channel : max_rqst_sz=%u max_resp_sz=%u "
 		"max_resp_sz_cached=%u max_ops=%u max_reqs=%u\n",
@@ -7510,10 +7859,9 @@ static int nfs4_verify_back_channel_attrs(struct nfs41_create_session_args *args
 		return -EINVAL;
 	if (rcvd->max_resp_sz_cached > sent->max_resp_sz_cached)
 		return -EINVAL;
-	/* These would render the backchannel useless: */
-	if (rcvd->max_ops != sent->max_ops)
+	if (rcvd->max_ops > sent->max_ops)
 		return -EINVAL;
-	if (rcvd->max_reqs != sent->max_reqs)
+	if (rcvd->max_reqs > sent->max_reqs)
 		return -EINVAL;
 out:
 	return 0;
@@ -7982,6 +8330,8 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
 	case -NFS4ERR_RECALLCONFLICT:
 		status = -ERECALLCONFLICT;
 		break;
+	case -NFS4ERR_DELEG_REVOKED:
+	case -NFS4ERR_ADMIN_REVOKED:
 	case -NFS4ERR_EXPIRED:
 	case -NFS4ERR_BAD_STATEID:
 		exception->timeout = 0;
@@ -7993,6 +8343,7 @@ nfs4_layoutget_handle_exception(struct rpc_task *task,
 					&lgp->args.ctx->state->stateid)) {
 			spin_unlock(&inode->i_lock);
 			exception->state = lgp->args.ctx->state;
+			exception->stateid = &lgp->args.stateid;
 			break;
 		}
 
@@ -8591,6 +8942,24 @@ static int _nfs41_test_stateid(struct nfs_server *server,
 	return -res.status;
 }
 
+static void nfs4_handle_delay_or_session_error(struct nfs_server *server,
+		int err, struct nfs4_exception *exception)
+{
+	exception->retry = 0;
+	switch(err) {
+	case -NFS4ERR_DELAY:
+	case -NFS4ERR_RETRY_UNCACHED_REP:
+		nfs4_handle_exception(server, err, exception);
+		break;
+	case -NFS4ERR_BADSESSION:
+	case -NFS4ERR_BADSLOT:
+	case -NFS4ERR_BAD_HIGH_SLOT:
+	case -NFS4ERR_CONN_NOT_BOUND_TO_SESSION:
+	case -NFS4ERR_DEADSESSION:
+		nfs4_do_handle_exception(server, err, exception);
+	}
+}
+
 /**
  * nfs41_test_stateid - perform a TEST_STATEID operation
  *
@@ -8610,9 +8979,7 @@ static int nfs41_test_stateid(struct nfs_server *server,
 	int err;
 	do {
 		err = _nfs41_test_stateid(server, stateid, cred);
-		if (err != -NFS4ERR_DELAY)
-			break;
-		nfs4_handle_exception(server, err, &exception);
+		nfs4_handle_delay_or_session_error(server, err, &exception);
 	} while (exception.retry);
 	return err;
 }
@@ -8657,7 +9024,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = {
 };
 
 static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
-		nfs4_stateid *stateid,
+		const nfs4_stateid *stateid,
 		struct rpc_cred *cred,
 		bool privileged)
 {
@@ -8687,7 +9054,7 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
 
 	msg.rpc_argp = &data->args;
 	msg.rpc_resp = &data->res;
-	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 0);
+	nfs4_init_sequence(&data->args.seq_args, &data->res.seq_res, 1);
 	if (privileged)
 		nfs4_set_sequence_privileged(&data->args.seq_args);
 
@@ -8700,38 +9067,31 @@ static struct rpc_task *_nfs41_free_stateid(struct nfs_server *server,
  * @server: server / transport on which to perform the operation
  * @stateid: state ID to release
  * @cred: credential
+ * @is_recovery: set to true if this call needs to be privileged
  *
- * Returns NFS_OK if the server freed "stateid".  Otherwise a
- * negative NFS4ERR value is returned.
+ * Note: this function is always asynchronous.
  */
 static int nfs41_free_stateid(struct nfs_server *server,
-		nfs4_stateid *stateid,
-		struct rpc_cred *cred)
+		const nfs4_stateid *stateid,
+		struct rpc_cred *cred,
+		bool is_recovery)
 {
 	struct rpc_task *task;
-	int ret;
 
-	task = _nfs41_free_stateid(server, stateid, cred, true);
+	task = _nfs41_free_stateid(server, stateid, cred, is_recovery);
 	if (IS_ERR(task))
 		return PTR_ERR(task);
-	ret = rpc_wait_for_completion_task(task);
-	if (!ret)
-		ret = task->tk_status;
 	rpc_put_task(task);
-	return ret;
+	return 0;
 }
 
 static void
 nfs41_free_lock_state(struct nfs_server *server, struct nfs4_lock_state *lsp)
 {
-	struct rpc_task *task;
 	struct rpc_cred *cred = lsp->ls_state->owner->so_cred;
 
-	task = _nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
+	nfs41_free_stateid(server, &lsp->ls_stateid, cred, false);
 	nfs4_free_lock_state(server, lsp);
-	if (IS_ERR(task))
-		return;
-	rpc_put_task(task);
 }
 
 static bool nfs41_match_stateid(const nfs4_stateid *s1,
@@ -8835,6 +9195,7 @@ static const struct nfs4_minor_version_ops nfs_v4_0_minor_ops = {
 	.match_stateid = nfs4_match_stateid,
 	.find_root_sec = nfs4_find_root_sec,
 	.free_lock_state = nfs4_release_lockowner,
+	.test_and_free_expired = nfs40_test_and_free_expired_stateid,
 	.alloc_seqid = nfs_alloc_seqid,
 	.call_sync_ops = &nfs40_call_sync_ops,
 	.reboot_recovery_ops = &nfs40_reboot_recovery_ops,
@@ -8862,7 +9223,9 @@ static const struct nfs4_minor_version_ops nfs_v4_1_minor_ops = {
 	.match_stateid = nfs41_match_stateid,
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
+	.test_and_free_expired = nfs41_test_and_free_expired_stateid,
 	.alloc_seqid = nfs_alloc_no_seqid,
+	.session_trunk = nfs4_test_session_trunk,
 	.call_sync_ops = &nfs41_call_sync_ops,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
@@ -8891,7 +9254,9 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
 	.find_root_sec = nfs41_find_root_sec,
 	.free_lock_state = nfs41_free_lock_state,
 	.call_sync_ops = &nfs41_call_sync_ops,
+	.test_and_free_expired = nfs41_test_and_free_expired_stateid,
 	.alloc_seqid = nfs_alloc_no_seqid,
+	.session_trunk = nfs4_test_session_trunk,
 	.reboot_recovery_ops = &nfs41_reboot_recovery_ops,
 	.nograce_recovery_ops = &nfs41_nograce_recovery_ops,
 	.state_renewal_ops = &nfs41_state_renewal_ops,
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index f703b755351b..dae385500005 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -9,6 +9,7 @@
 
 /* maximum number of slots to use */
 #define NFS4_DEF_SLOT_TABLE_SIZE (64U)
+#define NFS4_DEF_CB_SLOT_TABLE_SIZE (1U)
 #define NFS4_MAX_SLOT_TABLE (1024U)
 #define NFS4_NO_SLOT ((u32)-1)
 
@@ -22,6 +23,7 @@ struct nfs4_slot {
 	u32			slot_nr;
 	u32		 	seq_nr;
 	unsigned int		interrupted : 1,
+				privileged : 1,
 				seq_done : 1;
 };
 
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cada00aa5096..5f4281ec5f72 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -991,6 +991,8 @@ int nfs4_select_rw_stateid(struct nfs4_state *state,
 {
 	int ret;
 
+	if (!nfs4_valid_open_stateid(state))
+		return -EIO;
 	if (cred != NULL)
 		*cred = NULL;
 	ret = nfs4_copy_lock_stateid(dst, state, lockowner);
@@ -1303,6 +1305,8 @@ void nfs4_schedule_path_down_recovery(struct nfs_client *clp)
 static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_state *state)
 {
 
+	if (!nfs4_valid_open_stateid(state))
+		return 0;
 	set_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
 	/* Don't recover state that expired before the reboot */
 	if (test_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags)) {
@@ -1316,6 +1320,8 @@ static int nfs4_state_mark_reclaim_reboot(struct nfs_client *clp, struct nfs4_st
 
 int nfs4_state_mark_reclaim_nograce(struct nfs_client *clp, struct nfs4_state *state)
 {
+	if (!nfs4_valid_open_stateid(state))
+		return 0;
 	set_bit(NFS_STATE_RECLAIM_NOGRACE, &state->flags);
 	clear_bit(NFS_STATE_RECLAIM_REBOOT, &state->flags);
 	set_bit(NFS_OWNER_RECLAIM_NOGRACE, &state->owner->so_flags);
@@ -1327,9 +1333,8 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
 {
 	struct nfs_client *clp = server->nfs_client;
 
-	if (!nfs4_valid_open_stateid(state))
+	if (!nfs4_state_mark_reclaim_nograce(clp, state))
 		return -EBADF;
-	nfs4_state_mark_reclaim_nograce(clp, state);
 	dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
 			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
@@ -1337,6 +1342,35 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
 
+static struct nfs4_lock_state *
+nfs_state_find_lock_state_by_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	struct nfs4_lock_state *pos;
+
+	list_for_each_entry(pos, &state->lock_states, ls_locks) {
+		if (!test_bit(NFS_LOCK_INITIALIZED, &pos->ls_flags))
+			continue;
+		if (nfs4_stateid_match_other(&pos->ls_stateid, stateid))
+			return pos;
+	}
+	return NULL;
+}
+
+static bool nfs_state_lock_state_matches_stateid(struct nfs4_state *state,
+		const nfs4_stateid *stateid)
+{
+	bool found = false;
+
+	if (test_bit(LK_STATE_IN_USE, &state->flags)) {
+		spin_lock(&state->state_lock);
+		if (nfs_state_find_lock_state_by_stateid(state, stateid))
+			found = true;
+		spin_unlock(&state->state_lock);
+	}
+	return found;
+}
+
 void nfs_inode_find_state_and_recover(struct inode *inode,
 		const nfs4_stateid *stateid)
 {
@@ -1351,14 +1385,18 @@ void nfs_inode_find_state_and_recover(struct inode *inode,
 		state = ctx->state;
 		if (state == NULL)
 			continue;
-		if (!test_bit(NFS_DELEGATED_STATE, &state->flags))
+		if (nfs4_stateid_match_other(&state->stateid, stateid) &&
+		    nfs4_state_mark_reclaim_nograce(clp, state)) {
+			found = true;
 			continue;
-		if (!nfs4_stateid_match(&state->stateid, stateid))
-			continue;
-		nfs4_state_mark_reclaim_nograce(clp, state);
-		found = true;
+		}
+		if (nfs_state_lock_state_matches_stateid(state, stateid) &&
+		    nfs4_state_mark_reclaim_nograce(clp, state))
+			found = true;
 	}
 	spin_unlock(&inode->i_lock);
+
+	nfs_inode_find_delegation_state_and_recover(inode, stateid);
 	if (found)
 		nfs4_schedule_state_manager(clp);
 }
@@ -1498,6 +1536,9 @@ restart:
 					__func__, status);
 			case -ENOENT:
 			case -ENOMEM:
+			case -EACCES:
+			case -EROFS:
+			case -EIO:
 			case -ESTALE:
 				/* Open state on this file cannot be recovered */
 				nfs4_state_mark_recovery_failed(state, status);
@@ -1656,15 +1697,9 @@ static void nfs4_state_end_reclaim_reboot(struct nfs_client *clp)
 	put_rpccred(cred);
 }
 
-static void nfs_delegation_clear_all(struct nfs_client *clp)
-{
-	nfs_delegation_mark_reclaim(clp);
-	nfs_delegation_reap_unclaimed(clp);
-}
-
 static void nfs4_state_start_reclaim_nograce(struct nfs_client *clp)
 {
-	nfs_delegation_clear_all(clp);
+	nfs_mark_test_expired_all_delegations(clp);
 	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
 }
 
@@ -2195,7 +2230,7 @@ static void nfs41_handle_all_state_revoked(struct nfs_client *clp)
 
 static void nfs41_handle_some_state_revoked(struct nfs_client *clp)
 {
-	nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_nograce);
+	nfs4_state_start_reclaim_nograce(clp);
 	nfs4_schedule_state_manager(clp);
 
 	dprintk("%s: state revoked on server %s\n", __func__, clp->cl_hostname);
@@ -2227,13 +2262,22 @@ static void nfs41_handle_cb_path_down(struct nfs_client *clp)
 		nfs4_schedule_state_manager(clp);
 }
 
-void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
+void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags,
+		bool recovery)
 {
 	if (!flags)
 		return;
 
 	dprintk("%s: \"%s\" (client ID %llx) flags=0x%08x\n",
 		__func__, clp->cl_hostname, clp->cl_clientid, flags);
+	/*
+	 * If we're called from the state manager thread, then assume we're
+	 * already handling the RECLAIM_NEEDED and/or STATE_REVOKED.
+	 * Those flags are expected to remain set until we're done
+	 * recovering (see RFC5661, section 18.46.3).
+	 */
+	if (recovery)
+		goto out_recovery;
 
 	if (flags & SEQ4_STATUS_RESTART_RECLAIM_NEEDED)
 		nfs41_handle_server_reboot(clp);
@@ -2246,6 +2290,7 @@ void nfs41_handle_sequence_flag_errors(struct nfs_client *clp, u32 flags)
 		nfs4_schedule_lease_moved_recovery(clp);
 	if (flags & SEQ4_STATUS_RECALLABLE_STATE_REVOKED)
 		nfs41_handle_recallable_state_revoked(clp);
+out_recovery:
 	if (flags & SEQ4_STATUS_BACKCHANNEL_FAULT)
 		nfs41_handle_backchannel_fault(clp);
 	else if (flags & (SEQ4_STATUS_CB_PATH_DOWN |
@@ -2410,6 +2455,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
 			nfs4_state_end_reclaim_reboot(clp);
 		}
 
+		/* Detect expired delegations... */
+		if (test_and_clear_bit(NFS4CLNT_DELEGATION_EXPIRED, &clp->cl_state)) {
+			section = "detect expired delegations";
+			nfs_reap_expired_delegations(clp);
+			continue;
+		}
+
 		/* Now recover expired state... */
 		if (test_and_clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state)) {
 			section = "reclaim nograce";
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index 7bd3a5c09d31..fc89e5ed07ee 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -1850,7 +1850,7 @@ static void encode_create_session(struct xdr_stream *xdr,
 	*p++ = cpu_to_be32(RPC_AUTH_UNIX);			/* auth_sys */
 
 	/* authsys_parms rfc1831 */
-	*p++ = cpu_to_be32(nn->boot_time.tv_nsec);	/* stamp */
+	*p++ = cpu_to_be32(ktime_to_ns(nn->boot_time));	/* stamp */
 	p = xdr_encode_array(p, clnt->cl_nodename, clnt->cl_nodelen);
 	*p++ = cpu_to_be32(0);				/* UID */
 	*p++ = cpu_to_be32(0);				/* GID */
@@ -4725,34 +4725,37 @@ static int decode_getfattr(struct xdr_stream *xdr, struct nfs_fattr *fattr,
 }
 
 /*
- * Decode potentially multiple layout types. Currently we only support
- * one layout driver per file system.
+ * Decode potentially multiple layout types.
  */
-static int decode_first_pnfs_layout_type(struct xdr_stream *xdr,
-					 uint32_t *layouttype)
+static int decode_pnfs_layout_types(struct xdr_stream *xdr,
+				    struct nfs_fsinfo *fsinfo)
 {
 	__be32 *p;
-	int num;
+	uint32_t i;
 
 	p = xdr_inline_decode(xdr, 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	num = be32_to_cpup(p);
+	fsinfo->nlayouttypes = be32_to_cpup(p);
 
 	/* pNFS is not supported by the underlying file system */
-	if (num == 0) {
-		*layouttype = 0;
+	if (fsinfo->nlayouttypes == 0)
 		return 0;
-	}
-	if (num > 1)
-		printk(KERN_INFO "NFS: %s: Warning: Multiple pNFS layout "
-			"drivers per filesystem not supported\n", __func__);
 
 	/* Decode and set first layout type, move xdr->p past unused types */
-	p = xdr_inline_decode(xdr, num * 4);
+	p = xdr_inline_decode(xdr, fsinfo->nlayouttypes * 4);
 	if (unlikely(!p))
 		goto out_overflow;
-	*layouttype = be32_to_cpup(p);
+
+	/* If we get too many, then just cap it at the max */
+	if (fsinfo->nlayouttypes > NFS_MAX_LAYOUT_TYPES) {
+		printk(KERN_INFO "NFS: %s: Warning: Too many (%u) pNFS layout types\n",
+			__func__, fsinfo->nlayouttypes);
+		fsinfo->nlayouttypes = NFS_MAX_LAYOUT_TYPES;
+	}
+
+	for(i = 0; i < fsinfo->nlayouttypes; ++i)
+		fsinfo->layouttype[i] = be32_to_cpup(p++);
 	return 0;
 out_overflow:
 	print_overflow_msg(__func__, xdr);
@@ -4764,7 +4767,7 @@ out_overflow:
  * Note we must ensure that layouttype is set in any non-error case.
  */
 static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
-				uint32_t *layouttype)
+				struct nfs_fsinfo *fsinfo)
 {
 	int status = 0;
 
@@ -4772,10 +4775,9 @@ static int decode_attr_pnfstype(struct xdr_stream *xdr, uint32_t *bitmap,
 	if (unlikely(bitmap[1] & (FATTR4_WORD1_FS_LAYOUT_TYPES - 1U)))
 		return -EIO;
 	if (bitmap[1] & FATTR4_WORD1_FS_LAYOUT_TYPES) {
-		status = decode_first_pnfs_layout_type(xdr, layouttype);
+		status = decode_pnfs_layout_types(xdr, fsinfo);
 		bitmap[1] &= ~FATTR4_WORD1_FS_LAYOUT_TYPES;
-	} else
-		*layouttype = 0;
+	}
 	return status;
 }
 
@@ -4856,7 +4858,7 @@ static int decode_fsinfo(struct xdr_stream *xdr, struct nfs_fsinfo *fsinfo)
 	status = decode_attr_time_delta(xdr, bitmap, &fsinfo->time_delta);
 	if (status != 0)
 		goto xdr_error;
-	status = decode_attr_pnfstype(xdr, bitmap, &fsinfo->layouttype);
+	status = decode_attr_pnfstype(xdr, bitmap, fsinfo);
 	if (status != 0)
 		goto xdr_error;
 
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 2c93a85eda51..56b2d96f9103 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -30,6 +30,7 @@
 #include <linux/nfs_fs.h>
 #include <linux/nfs_page.h>
 #include <linux/module.h>
+#include <linux/sort.h>
 #include "internal.h"
 #include "pnfs.h"
 #include "iostat.h"
@@ -99,35 +100,79 @@ unset_pnfs_layoutdriver(struct nfs_server *nfss)
 }
 
 /*
+ * When the server sends a list of layout types, we choose one in the order
+ * given in the list below.
+ *
+ * FIXME: should this list be configurable in some fashion? module param?
+ * 	  mount option? something else?
+ */
+static const u32 ld_prefs[] = {
+	LAYOUT_SCSI,
+	LAYOUT_BLOCK_VOLUME,
+	LAYOUT_OSD2_OBJECTS,
+	LAYOUT_FLEX_FILES,
+	LAYOUT_NFSV4_1_FILES,
+	0
+};
+
+static int
+ld_cmp(const void *e1, const void *e2)
+{
+	u32 ld1 = *((u32 *)e1);
+	u32 ld2 = *((u32 *)e2);
+	int i;
+
+	for (i = 0; ld_prefs[i] != 0; i++) {
+		if (ld1 == ld_prefs[i])
+			return -1;
+
+		if (ld2 == ld_prefs[i])
+			return 1;
+	}
+	return 0;
+}
+
+/*
  * Try to set the server's pnfs module to the pnfs layout type specified by id.
  * Currently only one pNFS layout driver per filesystem is supported.
  *
- * @id layout type. Zero (illegal layout type) indicates pNFS not in use.
+ * @ids array of layout types supported by MDS.
  */
 void
 set_pnfs_layoutdriver(struct nfs_server *server, const struct nfs_fh *mntfh,
-		      u32 id)
+		      struct nfs_fsinfo *fsinfo)
 {
 	struct pnfs_layoutdriver_type *ld_type = NULL;
+	u32 id;
+	int i;
 
-	if (id == 0)
-		goto out_no_driver;
 	if (!(server->nfs_client->cl_exchange_flags &
 		 (EXCHGID4_FLAG_USE_NON_PNFS | EXCHGID4_FLAG_USE_PNFS_MDS))) {
-		printk(KERN_ERR "NFS: %s: id %u cl_exchange_flags 0x%x\n",
-			__func__, id, server->nfs_client->cl_exchange_flags);
+		printk(KERN_ERR "NFS: %s: cl_exchange_flags 0x%x\n",
+			__func__, server->nfs_client->cl_exchange_flags);
 		goto out_no_driver;
 	}
-	ld_type = find_pnfs_driver(id);
-	if (!ld_type) {
-		request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX, id);
+
+	sort(fsinfo->layouttype, fsinfo->nlayouttypes,
+		sizeof(*fsinfo->layouttype), ld_cmp, NULL);
+
+	for (i = 0; i < fsinfo->nlayouttypes; i++) {
+		id = fsinfo->layouttype[i];
 		ld_type = find_pnfs_driver(id);
 		if (!ld_type) {
-			dprintk("%s: No pNFS module found for %u.\n",
-				__func__, id);
-			goto out_no_driver;
+			request_module("%s-%u", LAYOUT_NFSV4_1_MODULE_PREFIX,
+					id);
+			ld_type = find_pnfs_driver(id);
 		}
+		if (ld_type)
+			break;
+	}
+
+	if (!ld_type) {
+		dprintk("%s: No pNFS module found!\n", __func__);
+		goto out_no_driver;
 	}
+
 	server->pnfs_curr_ld = ld_type;
 	if (ld_type->set_layoutdriver
 	    && ld_type->set_layoutdriver(server, mntfh)) {
@@ -2185,10 +2230,8 @@ static void pnfs_ld_handle_read_error(struct nfs_pgio_header *hdr)
  */
 void pnfs_ld_read_done(struct nfs_pgio_header *hdr)
 {
-	if (likely(!hdr->pnfs_error)) {
-		__nfs4_read_done_cb(hdr);
+	if (likely(!hdr->pnfs_error))
 		hdr->mds_ops->rpc_call_done(&hdr->task, hdr);
-	}
 	trace_nfs4_pnfs_read(hdr, hdr->pnfs_error);
 	if (unlikely(hdr->pnfs_error))
 		pnfs_ld_handle_read_error(hdr);
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 31d99b2927b0..5c295512c967 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -236,7 +236,7 @@ void pnfs_get_layout_hdr(struct pnfs_layout_hdr *lo);
 void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
 void pnfs_put_lseg_locked(struct pnfs_layout_segment *lseg);
 
-void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, u32);
+void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
 void unset_pnfs_layoutdriver(struct nfs_server *);
 void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
 int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
@@ -657,7 +657,8 @@ pnfs_wait_on_layoutreturn(struct inode *ino, struct rpc_task *task)
 }
 
 static inline void set_pnfs_layoutdriver(struct nfs_server *s,
-					 const struct nfs_fh *mntfh, u32 id)
+					 const struct nfs_fh *mntfh,
+					 struct nfs_fsinfo *fsinfo)
 {
 }
 
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index f3468b57a32a..53b4705abcc7 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -690,13 +690,50 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 		dprintk("%s: DS %s: trying address %s\n",
 			__func__, ds->ds_remotestr, da->da_remotestr);
 
-		clp = nfs4_set_ds_client(mds_srv,
-					(struct sockaddr *)&da->da_addr,
-					da->da_addrlen, IPPROTO_TCP,
-					timeo, retrans, minor_version,
-					au_flavor);
-		if (!IS_ERR(clp))
-			break;
+		if (!IS_ERR(clp) && clp->cl_mvops->session_trunk) {
+			struct xprt_create xprt_args = {
+				.ident = XPRT_TRANSPORT_TCP,
+				.net = clp->cl_net,
+				.dstaddr = (struct sockaddr *)&da->da_addr,
+				.addrlen = da->da_addrlen,
+				.servername = clp->cl_hostname,
+			};
+			struct nfs4_add_xprt_data xprtdata = {
+				.clp = clp,
+				.cred = nfs4_get_clid_cred(clp),
+			};
+			struct rpc_add_xprt_test rpcdata = {
+				.add_xprt_test = clp->cl_mvops->session_trunk,
+				.data = &xprtdata,
+			};
+
+			/**
+			* Test this address for session trunking and
+			* add as an alias
+			*/
+			rpc_clnt_add_xprt(clp->cl_rpcclient, &xprt_args,
+					  rpc_clnt_setup_test_and_add_xprt,
+					  &rpcdata);
+			if (xprtdata.cred)
+				put_rpccred(xprtdata.cred);
+		} else {
+			clp = nfs4_set_ds_client(mds_srv,
+						(struct sockaddr *)&da->da_addr,
+						da->da_addrlen, IPPROTO_TCP,
+						timeo, retrans, minor_version,
+						au_flavor);
+			if (IS_ERR(clp))
+				continue;
+
+			status = nfs4_init_ds_session(clp,
+					mds_srv->nfs_client->cl_lease_time);
+			if (status) {
+				nfs_put_client(clp);
+				clp = ERR_PTR(-EIO);
+				continue;
+			}
+
+		}
 	}
 
 	if (IS_ERR(clp)) {
@@ -704,18 +741,11 @@ static int _nfs4_pnfs_v4_ds_connect(struct nfs_server *mds_srv,
 		goto out;
 	}
 
-	status = nfs4_init_ds_session(clp, mds_srv->nfs_client->cl_lease_time);
-	if (status)
-		goto out_put;
-
 	smp_wmb();
 	ds->ds_clp = clp;
 	dprintk("%s [new] addr: %s\n", __func__, ds->ds_remotestr);
 out:
 	return status;
-out_put:
-	nfs_put_client(clp);
-	goto out;
 }
 
 /*
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index d39601381adf..001796bcd6c8 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -2848,19 +2848,23 @@ out_invalid_transport_udp:
  * NFS client for backwards compatibility
  */
 unsigned int nfs_callback_set_tcpport;
+unsigned short nfs_callback_nr_threads;
 /* Default cache timeout is 10 minutes */
 unsigned int nfs_idmap_cache_timeout = 600;
 /* Turn off NFSv4 uid/gid mapping when using AUTH_SYS */
 bool nfs4_disable_idmapping = true;
 unsigned short max_session_slots = NFS4_DEF_SLOT_TABLE_SIZE;
+unsigned short max_session_cb_slots = NFS4_DEF_CB_SLOT_TABLE_SIZE;
 unsigned short send_implementation_id = 1;
 char nfs4_client_id_uniquifier[NFS4_CLIENT_ID_UNIQ_LEN] = "";
 bool recover_lost_locks = false;
 
+EXPORT_SYMBOL_GPL(nfs_callback_nr_threads);
 EXPORT_SYMBOL_GPL(nfs_callback_set_tcpport);
 EXPORT_SYMBOL_GPL(nfs_idmap_cache_timeout);
 EXPORT_SYMBOL_GPL(nfs4_disable_idmapping);
 EXPORT_SYMBOL_GPL(max_session_slots);
+EXPORT_SYMBOL_GPL(max_session_cb_slots);
 EXPORT_SYMBOL_GPL(send_implementation_id);
 EXPORT_SYMBOL_GPL(nfs4_client_id_uniquifier);
 EXPORT_SYMBOL_GPL(recover_lost_locks);
@@ -2887,6 +2891,9 @@ static const struct kernel_param_ops param_ops_portnr = {
 #define param_check_portnr(name, p) __param_check(name, p, unsigned int);
 
 module_param_named(callback_tcpport, nfs_callback_set_tcpport, portnr, 0644);
+module_param_named(callback_nr_threads, nfs_callback_nr_threads, ushort, 0644);
+MODULE_PARM_DESC(callback_nr_threads, "Number of threads that will be "
+		"assigned to the NFSv4 callback channels.");
 module_param(nfs_idmap_cache_timeout, int, 0644);
 module_param(nfs4_disable_idmapping, bool, 0644);
 module_param_string(nfs4_unique_id, nfs4_client_id_uniquifier,
@@ -2896,6 +2903,9 @@ MODULE_PARM_DESC(nfs4_disable_idmapping,
 module_param(max_session_slots, ushort, 0644);
 MODULE_PARM_DESC(max_session_slots, "Maximum number of outstanding NFSv4.1 "
 		"requests the client will negotiate");
+module_param(max_session_cb_slots, ushort, 0644);
+MODULE_PARM_DESC(max_session_slots, "Maximum number of parallel NFSv4.1 "
+		"callbacks the client will process for a given server");
 module_param(send_implementation_id, ushort, 0644);
 MODULE_PARM_DESC(send_implementation_id,
 		"Send implementation ID with NFSv4.1 exchange_id");
diff --git a/fs/nfsd/flexfilelayout.c b/fs/nfsd/flexfilelayout.c
index df880e9fa71f..b67287383010 100644
--- a/fs/nfsd/flexfilelayout.c
+++ b/fs/nfsd/flexfilelayout.c
@@ -126,6 +126,7 @@ nfsd4_ff_proc_getdeviceinfo(struct super_block *sb, struct svc_rqst *rqstp,
 const struct nfsd4_layout_ops ff_layout_ops = {
 	.notify_types		=
 			NOTIFY_DEVICEID4_DELETE | NOTIFY_DEVICEID4_CHANGE,
+	.disable_recalls	= true,
 	.proc_getdeviceinfo	= nfsd4_ff_proc_getdeviceinfo,
 	.encode_getdeviceinfo	= nfsd4_ff_encode_getdeviceinfo,
 	.proc_layoutget		= nfsd4_ff_proc_layoutget,
diff --git a/fs/nfsd/netns.h b/fs/nfsd/netns.h
index 5fbf3bbd00d0..b10d557f9c9e 100644
--- a/fs/nfsd/netns.h
+++ b/fs/nfsd/netns.h
@@ -84,6 +84,7 @@ struct nfsd_net {
 	struct list_head client_lru;
 	struct list_head close_lru;
 	struct list_head del_recall_lru;
+	struct list_head blocked_locks_lru;
 
 	struct delayed_work laundromat_work;
 
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index 04c68d900324..211dc2aed8e1 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -448,7 +448,7 @@ static int decode_cb_sequence4res(struct xdr_stream *xdr,
 {
 	int status;
 
-	if (cb->cb_minorversion == 0)
+	if (cb->cb_clp->cl_minorversion == 0)
 		return 0;
 
 	status = decode_cb_op_status(xdr, OP_CB_SEQUENCE, &cb->cb_seq_status);
@@ -485,7 +485,7 @@ static void nfs4_xdr_enc_cb_recall(struct rpc_rqst *req, struct xdr_stream *xdr,
 	const struct nfs4_delegation *dp = cb_to_delegation(cb);
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = cb->cb_clp->cl_cb_ident,
-		.minorversion = cb->cb_minorversion,
+		.minorversion = cb->cb_clp->cl_minorversion,
 	};
 
 	encode_cb_compound4args(xdr, &hdr);
@@ -594,7 +594,7 @@ static void nfs4_xdr_enc_cb_layout(struct rpc_rqst *req,
 		container_of(cb, struct nfs4_layout_stateid, ls_recall);
 	struct nfs4_cb_compound_hdr hdr = {
 		.ident = 0,
-		.minorversion = cb->cb_minorversion,
+		.minorversion = cb->cb_clp->cl_minorversion,
 	};
 
 	encode_cb_compound4args(xdr, &hdr);
@@ -623,6 +623,62 @@ static int nfs4_xdr_dec_cb_layout(struct rpc_rqst *rqstp,
 }
 #endif /* CONFIG_NFSD_PNFS */
 
+static void encode_stateowner(struct xdr_stream *xdr, struct nfs4_stateowner *so)
+{
+	__be32	*p;
+
+	p = xdr_reserve_space(xdr, 8 + 4 + so->so_owner.len);
+	p = xdr_encode_opaque_fixed(p, &so->so_client->cl_clientid, 8);
+	xdr_encode_opaque(p, so->so_owner.data, so->so_owner.len);
+}
+
+static void nfs4_xdr_enc_cb_notify_lock(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					const struct nfsd4_callback *cb)
+{
+	const struct nfsd4_blocked_lock *nbl =
+		container_of(cb, struct nfsd4_blocked_lock, nbl_cb);
+	struct nfs4_lockowner *lo = (struct nfs4_lockowner *)nbl->nbl_lock.fl_owner;
+	struct nfs4_cb_compound_hdr hdr = {
+		.ident = 0,
+		.minorversion = cb->cb_clp->cl_minorversion,
+	};
+
+	__be32 *p;
+
+	BUG_ON(hdr.minorversion == 0);
+
+	encode_cb_compound4args(xdr, &hdr);
+	encode_cb_sequence4args(xdr, cb, &hdr);
+
+	p = xdr_reserve_space(xdr, 4);
+	*p = cpu_to_be32(OP_CB_NOTIFY_LOCK);
+	encode_nfs_fh4(xdr, &nbl->nbl_fh);
+	encode_stateowner(xdr, &lo->lo_owner);
+	hdr.nops++;
+
+	encode_cb_nops(&hdr);
+}
+
+static int nfs4_xdr_dec_cb_notify_lock(struct rpc_rqst *rqstp,
+					struct xdr_stream *xdr,
+					struct nfsd4_callback *cb)
+{
+	struct nfs4_cb_compound_hdr hdr;
+	int status;
+
+	status = decode_cb_compound4res(xdr, &hdr);
+	if (unlikely(status))
+		return status;
+
+	if (cb) {
+		status = decode_cb_sequence4res(xdr, cb);
+		if (unlikely(status || cb->cb_seq_status))
+			return status;
+	}
+	return decode_cb_op_status(xdr, OP_CB_NOTIFY_LOCK, &cb->cb_status);
+}
+
 /*
  * RPC procedure tables
  */
@@ -643,6 +699,7 @@ static struct rpc_procinfo nfs4_cb_procedures[] = {
 #ifdef CONFIG_NFSD_PNFS
 	PROC(CB_LAYOUT,	COMPOUND,	cb_layout,	cb_layout),
 #endif
+	PROC(CB_NOTIFY_LOCK,	COMPOUND,	cb_notify_lock,	cb_notify_lock),
 };
 
 static struct rpc_version nfs_cb_version4 = {
@@ -862,7 +919,6 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
 	struct nfs4_client *clp = cb->cb_clp;
 	u32 minorversion = clp->cl_minorversion;
 
-	cb->cb_minorversion = minorversion;
 	/*
 	 * cb_seq_status is only set in decode_cb_sequence4res,
 	 * and so will remain 1 if an rpc level failure occurs.
diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c
index 2be9602b0221..42aace4fc4c8 100644
--- a/fs/nfsd/nfs4layouts.c
+++ b/fs/nfsd/nfs4layouts.c
@@ -174,7 +174,8 @@ nfsd4_free_layout_stateid(struct nfs4_stid *stid)
 	list_del_init(&ls->ls_perfile);
 	spin_unlock(&fp->fi_lock);
 
-	vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
+	if (!nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+		vfs_setlease(ls->ls_file, F_UNLCK, NULL, (void **)&ls);
 	fput(ls->ls_file);
 
 	if (ls->ls_recalled)
@@ -189,6 +190,9 @@ nfsd4_layout_setlease(struct nfs4_layout_stateid *ls)
 	struct file_lock *fl;
 	int status;
 
+	if (nfsd4_layout_ops[ls->ls_layout_type]->disable_recalls)
+		return 0;
+
 	fl = locks_alloc_lock();
 	if (!fl)
 		return -ENOMEM;
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 1fb222752b2b..abb09b580389 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1010,47 +1010,97 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 }
 
 static __be32
-nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
-		struct nfsd4_clone *clone)
+nfsd4_verify_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		  stateid_t *src_stateid, struct file **src,
+		  stateid_t *dst_stateid, struct file **dst)
 {
-	struct file *src, *dst;
 	__be32 status;
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->save_fh,
-					    &clone->cl_src_stateid, RD_STATE,
-					    &src, NULL);
+					    src_stateid, RD_STATE, src, NULL);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process src stateid!\n", __func__);
 		goto out;
 	}
 
 	status = nfs4_preprocess_stateid_op(rqstp, cstate, &cstate->current_fh,
-					    &clone->cl_dst_stateid, WR_STATE,
-					    &dst, NULL);
+					    dst_stateid, WR_STATE, dst, NULL);
 	if (status) {
 		dprintk("NFSD: %s: couldn't process dst stateid!\n", __func__);
 		goto out_put_src;
 	}
 
 	/* fix up for NFS-specific error code */
-	if (!S_ISREG(file_inode(src)->i_mode) ||
-	    !S_ISREG(file_inode(dst)->i_mode)) {
+	if (!S_ISREG(file_inode(*src)->i_mode) ||
+	    !S_ISREG(file_inode(*dst)->i_mode)) {
 		status = nfserr_wrong_type;
 		goto out_put_dst;
 	}
 
+out:
+	return status;
+out_put_dst:
+	fput(*dst);
+out_put_src:
+	fput(*src);
+	goto out;
+}
+
+static __be32
+nfsd4_clone(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_clone *clone)
+{
+	struct file *src, *dst;
+	__be32 status;
+
+	status = nfsd4_verify_copy(rqstp, cstate, &clone->cl_src_stateid, &src,
+				   &clone->cl_dst_stateid, &dst);
+	if (status)
+		goto out;
+
 	status = nfsd4_clone_file_range(src, clone->cl_src_pos,
 			dst, clone->cl_dst_pos, clone->cl_count);
 
-out_put_dst:
 	fput(dst);
-out_put_src:
 	fput(src);
 out:
 	return status;
 }
 
 static __be32
+nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
+		struct nfsd4_copy *copy)
+{
+	struct file *src, *dst;
+	__be32 status;
+	ssize_t bytes;
+
+	status = nfsd4_verify_copy(rqstp, cstate, &copy->cp_src_stateid, &src,
+				   &copy->cp_dst_stateid, &dst);
+	if (status)
+		goto out;
+
+	bytes = nfsd_copy_file_range(src, copy->cp_src_pos,
+			dst, copy->cp_dst_pos, copy->cp_count);
+
+	if (bytes < 0)
+		status = nfserrno(bytes);
+	else {
+		copy->cp_res.wr_bytes_written = bytes;
+		copy->cp_res.wr_stable_how = NFS_UNSTABLE;
+		copy->cp_consecutive = 1;
+		copy->cp_synchronous = 1;
+		gen_boot_verifier(&copy->cp_res.wr_verifier, SVC_NET(rqstp));
+		status = nfs_ok;
+	}
+
+	fput(src);
+	fput(dst);
+out:
+	return status;
+}
+
+static __be32
 nfsd4_fallocate(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		struct nfsd4_fallocate *fallocate, int flags)
 {
@@ -1966,6 +2016,18 @@ static inline u32 nfsd4_create_session_rsize(struct svc_rqst *rqstp, struct nfsd
 		op_encode_channel_attrs_maxsz) * sizeof(__be32);
 }
 
+static inline u32 nfsd4_copy_rsize(struct svc_rqst *rqstp, struct nfsd4_op *op)
+{
+	return (op_encode_hdr_size +
+		1 /* wr_callback */ +
+		op_encode_stateid_maxsz /* wr_callback */ +
+		2 /* wr_count */ +
+		1 /* wr_committed */ +
+		op_encode_verifier_maxsz +
+		1 /* cr_consecutive */ +
+		1 /* cr_synchronous */) * sizeof(__be32);
+}
+
 #ifdef CONFIG_NFSD_PNFS
 /*
  * At this stage we don't really know what layout driver will handle the request,
@@ -2328,6 +2390,12 @@ static struct nfsd4_operation nfsd4_ops[] = {
 		.op_name = "OP_CLONE",
 		.op_rsize_bop = (nfsd4op_rsize)nfsd4_only_status_rsize,
 	},
+	[OP_COPY] = {
+		.op_func = (nfsd4op_func)nfsd4_copy,
+		.op_flags = OP_MODIFIES_SOMETHING | OP_CACHEME,
+		.op_name = "OP_COPY",
+		.op_rsize_bop = (nfsd4op_rsize)nfsd4_copy_rsize,
+	},
 	[OP_SEEK] = {
 		.op_func = (nfsd4op_func)nfsd4_seek,
 		.op_name = "OP_SEEK",
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 39bfaba9c99c..9752beb78659 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -99,6 +99,7 @@ static struct kmem_cache *odstate_slab;
 static void free_session(struct nfsd4_session *);
 
 static const struct nfsd4_callback_ops nfsd4_cb_recall_ops;
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops;
 
 static bool is_session_dead(struct nfsd4_session *ses)
 {
@@ -210,6 +211,85 @@ static void nfsd4_put_session(struct nfsd4_session *ses)
 	spin_unlock(&nn->client_lock);
 }
 
+static struct nfsd4_blocked_lock *
+find_blocked_lock(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+			struct nfsd_net *nn)
+{
+	struct nfsd4_blocked_lock *cur, *found = NULL;
+
+	spin_lock(&nn->client_lock);
+	list_for_each_entry(cur, &lo->lo_blocked, nbl_list) {
+		if (fh_match(fh, &cur->nbl_fh)) {
+			list_del_init(&cur->nbl_list);
+			list_del_init(&cur->nbl_lru);
+			found = cur;
+			break;
+		}
+	}
+	spin_unlock(&nn->client_lock);
+	if (found)
+		posix_unblock_lock(&found->nbl_lock);
+	return found;
+}
+
+static struct nfsd4_blocked_lock *
+find_or_allocate_block(struct nfs4_lockowner *lo, struct knfsd_fh *fh,
+			struct nfsd_net *nn)
+{
+	struct nfsd4_blocked_lock *nbl;
+
+	nbl = find_blocked_lock(lo, fh, nn);
+	if (!nbl) {
+		nbl= kmalloc(sizeof(*nbl), GFP_KERNEL);
+		if (nbl) {
+			fh_copy_shallow(&nbl->nbl_fh, fh);
+			locks_init_lock(&nbl->nbl_lock);
+			nfsd4_init_cb(&nbl->nbl_cb, lo->lo_owner.so_client,
+					&nfsd4_cb_notify_lock_ops,
+					NFSPROC4_CLNT_CB_NOTIFY_LOCK);
+		}
+	}
+	return nbl;
+}
+
+static void
+free_blocked_lock(struct nfsd4_blocked_lock *nbl)
+{
+	locks_release_private(&nbl->nbl_lock);
+	kfree(nbl);
+}
+
+static int
+nfsd4_cb_notify_lock_done(struct nfsd4_callback *cb, struct rpc_task *task)
+{
+	/*
+	 * Since this is just an optimization, we don't try very hard if it
+	 * turns out not to succeed. We'll requeue it on NFS4ERR_DELAY, and
+	 * just quit trying on anything else.
+	 */
+	switch (task->tk_status) {
+	case -NFS4ERR_DELAY:
+		rpc_delay(task, 1 * HZ);
+		return 0;
+	default:
+		return 1;
+	}
+}
+
+static void
+nfsd4_cb_notify_lock_release(struct nfsd4_callback *cb)
+{
+	struct nfsd4_blocked_lock	*nbl = container_of(cb,
+						struct nfsd4_blocked_lock, nbl_cb);
+
+	free_blocked_lock(nbl);
+}
+
+static const struct nfsd4_callback_ops nfsd4_cb_notify_lock_ops = {
+	.done		= nfsd4_cb_notify_lock_done,
+	.release	= nfsd4_cb_notify_lock_release,
+};
+
 static inline struct nfs4_stateowner *
 nfs4_get_stateowner(struct nfs4_stateowner *sop)
 {
@@ -3224,9 +3304,10 @@ nfsd4_setclientid_confirm(struct svc_rqst *rqstp,
 		goto out;
 	/* cases below refer to rfc 3530 section 14.2.34: */
 	if (!unconf || !same_verf(&confirm, &unconf->cl_confirm)) {
-		if (conf && !unconf) /* case 2: probable retransmit */
+		if (conf && same_verf(&confirm, &conf->cl_confirm)) {
+			/* case 2: probable retransmit */
 			status = nfs_ok;
-		else /* case 4: client hasn't noticed we rebooted yet? */
+		} else /* case 4: client hasn't noticed we rebooted yet? */
 			status = nfserr_stale_clientid;
 		goto out;
 	}
@@ -4410,9 +4491,11 @@ out:
 	* To finish the open response, we just need to set the rflags.
 	*/
 	open->op_rflags = NFS4_OPEN_RESULT_LOCKTYPE_POSIX;
-	if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED) &&
-	    !nfsd4_has_session(&resp->cstate))
+	if (nfsd4_has_session(&resp->cstate))
+		open->op_rflags |= NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK;
+	else if (!(open->op_openowner->oo_flags & NFS4_OO_CONFIRMED))
 		open->op_rflags |= NFS4_OPEN_RESULT_CONFIRM;
+
 	if (dp)
 		nfs4_put_stid(&dp->dl_stid);
 	if (stp)
@@ -4501,6 +4584,7 @@ nfs4_laundromat(struct nfsd_net *nn)
 	struct nfs4_openowner *oo;
 	struct nfs4_delegation *dp;
 	struct nfs4_ol_stateid *stp;
+	struct nfsd4_blocked_lock *nbl;
 	struct list_head *pos, *next, reaplist;
 	time_t cutoff = get_seconds() - nn->nfsd4_lease;
 	time_t t, new_timeo = nn->nfsd4_lease;
@@ -4569,6 +4653,41 @@ nfs4_laundromat(struct nfsd_net *nn)
 	}
 	spin_unlock(&nn->client_lock);
 
+	/*
+	 * It's possible for a client to try and acquire an already held lock
+	 * that is being held for a long time, and then lose interest in it.
+	 * So, we clean out any un-revisited request after a lease period
+	 * under the assumption that the client is no longer interested.
+	 *
+	 * RFC5661, sec. 9.6 states that the client must not rely on getting
+	 * notifications and must continue to poll for locks, even when the
+	 * server supports them. Thus this shouldn't lead to clients blocking
+	 * indefinitely once the lock does become free.
+	 */
+	BUG_ON(!list_empty(&reaplist));
+	spin_lock(&nn->client_lock);
+	while (!list_empty(&nn->blocked_locks_lru)) {
+		nbl = list_first_entry(&nn->blocked_locks_lru,
+					struct nfsd4_blocked_lock, nbl_lru);
+		if (time_after((unsigned long)nbl->nbl_time,
+			       (unsigned long)cutoff)) {
+			t = nbl->nbl_time - cutoff;
+			new_timeo = min(new_timeo, t);
+			break;
+		}
+		list_move(&nbl->nbl_lru, &reaplist);
+		list_del_init(&nbl->nbl_list);
+	}
+	spin_unlock(&nn->client_lock);
+
+	while (!list_empty(&reaplist)) {
+		nbl = list_first_entry(&nn->blocked_locks_lru,
+					struct nfsd4_blocked_lock, nbl_lru);
+		list_del_init(&nbl->nbl_lru);
+		posix_unblock_lock(&nbl->nbl_lock);
+		free_blocked_lock(nbl);
+	}
+
 	new_timeo = max_t(time_t, new_timeo, NFSD_LAUNDROMAT_MINTIMEOUT);
 	return new_timeo;
 }
@@ -5309,7 +5428,31 @@ nfsd4_fl_put_owner(fl_owner_t owner)
 		nfs4_put_stateowner(&lo->lo_owner);
 }
 
+static void
+nfsd4_lm_notify(struct file_lock *fl)
+{
+	struct nfs4_lockowner		*lo = (struct nfs4_lockowner *)fl->fl_owner;
+	struct net			*net = lo->lo_owner.so_client->net;
+	struct nfsd_net			*nn = net_generic(net, nfsd_net_id);
+	struct nfsd4_blocked_lock	*nbl = container_of(fl,
+						struct nfsd4_blocked_lock, nbl_lock);
+	bool queue = false;
+
+	/* An empty list means that something else is going to be using it */
+	spin_lock(&nn->client_lock);
+	if (!list_empty(&nbl->nbl_list)) {
+		list_del_init(&nbl->nbl_list);
+		list_del_init(&nbl->nbl_lru);
+		queue = true;
+	}
+	spin_unlock(&nn->client_lock);
+
+	if (queue)
+		nfsd4_run_cb(&nbl->nbl_cb);
+}
+
 static const struct lock_manager_operations nfsd_posix_mng_ops  = {
+	.lm_notify = nfsd4_lm_notify,
 	.lm_get_owner = nfsd4_fl_get_owner,
 	.lm_put_owner = nfsd4_fl_put_owner,
 };
@@ -5407,6 +5550,7 @@ alloc_init_lock_stateowner(unsigned int strhashval, struct nfs4_client *clp,
 	lo = alloc_stateowner(lockowner_slab, &lock->lk_new_owner, clp);
 	if (!lo)
 		return NULL;
+	INIT_LIST_HEAD(&lo->lo_blocked);
 	INIT_LIST_HEAD(&lo->lo_owner.so_stateids);
 	lo->lo_owner.so_is_open_owner = 0;
 	lo->lo_owner.so_seqid = lock->lk_new_lock_seqid;
@@ -5588,12 +5732,15 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	struct nfs4_ol_stateid *open_stp = NULL;
 	struct nfs4_file *fp;
 	struct file *filp = NULL;
+	struct nfsd4_blocked_lock *nbl = NULL;
 	struct file_lock *file_lock = NULL;
 	struct file_lock *conflock = NULL;
 	__be32 status = 0;
 	int lkflg;
 	int err;
 	bool new = false;
+	unsigned char fl_type;
+	unsigned int fl_flags = FL_POSIX;
 	struct net *net = SVC_NET(rqstp);
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
@@ -5658,46 +5805,55 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 	if (!locks_in_grace(net) && lock->lk_reclaim)
 		goto out;
 
-	file_lock = locks_alloc_lock();
-	if (!file_lock) {
-		dprintk("NFSD: %s: unable to allocate lock!\n", __func__);
-		status = nfserr_jukebox;
-		goto out;
-	}
-
 	fp = lock_stp->st_stid.sc_file;
 	switch (lock->lk_type) {
-		case NFS4_READ_LT:
 		case NFS4_READW_LT:
+			if (nfsd4_has_session(cstate))
+				fl_flags |= FL_SLEEP;
+			/* Fallthrough */
+		case NFS4_READ_LT:
 			spin_lock(&fp->fi_lock);
 			filp = find_readable_file_locked(fp);
 			if (filp)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_READ);
 			spin_unlock(&fp->fi_lock);
-			file_lock->fl_type = F_RDLCK;
+			fl_type = F_RDLCK;
 			break;
-		case NFS4_WRITE_LT:
 		case NFS4_WRITEW_LT:
+			if (nfsd4_has_session(cstate))
+				fl_flags |= FL_SLEEP;
+			/* Fallthrough */
+		case NFS4_WRITE_LT:
 			spin_lock(&fp->fi_lock);
 			filp = find_writeable_file_locked(fp);
 			if (filp)
 				get_lock_access(lock_stp, NFS4_SHARE_ACCESS_WRITE);
 			spin_unlock(&fp->fi_lock);
-			file_lock->fl_type = F_WRLCK;
+			fl_type = F_WRLCK;
 			break;
 		default:
 			status = nfserr_inval;
 		goto out;
 	}
+
 	if (!filp) {
 		status = nfserr_openmode;
 		goto out;
 	}
 
+	nbl = find_or_allocate_block(lock_sop, &fp->fi_fhandle, nn);
+	if (!nbl) {
+		dprintk("NFSD: %s: unable to allocate block!\n", __func__);
+		status = nfserr_jukebox;
+		goto out;
+	}
+
+	file_lock = &nbl->nbl_lock;
+	file_lock->fl_type = fl_type;
 	file_lock->fl_owner = (fl_owner_t)lockowner(nfs4_get_stateowner(&lock_sop->lo_owner));
 	file_lock->fl_pid = current->tgid;
 	file_lock->fl_file = filp;
-	file_lock->fl_flags = FL_POSIX;
+	file_lock->fl_flags = fl_flags;
 	file_lock->fl_lmops = &nfsd_posix_mng_ops;
 	file_lock->fl_start = lock->lk_offset;
 	file_lock->fl_end = last_byte_offset(lock->lk_offset, lock->lk_length);
@@ -5710,18 +5866,29 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		goto out;
 	}
 
+	if (fl_flags & FL_SLEEP) {
+		nbl->nbl_time = jiffies;
+		spin_lock(&nn->client_lock);
+		list_add_tail(&nbl->nbl_list, &lock_sop->lo_blocked);
+		list_add_tail(&nbl->nbl_lru, &nn->blocked_locks_lru);
+		spin_unlock(&nn->client_lock);
+	}
+
 	err = vfs_lock_file(filp, F_SETLK, file_lock, conflock);
-	switch (-err) {
+	switch (err) {
 	case 0: /* success! */
 		nfs4_inc_and_copy_stateid(&lock->lk_resp_stateid, &lock_stp->st_stid);
 		status = 0;
 		break;
-	case (EAGAIN):		/* conflock holds conflicting lock */
+	case FILE_LOCK_DEFERRED:
+		nbl = NULL;
+		/* Fallthrough */
+	case -EAGAIN:		/* conflock holds conflicting lock */
 		status = nfserr_denied;
 		dprintk("NFSD: nfsd4_lock: conflicting lock found!\n");
 		nfs4_set_lock_denied(conflock, &lock->lk_denied);
 		break;
-	case (EDEADLK):
+	case -EDEADLK:
 		status = nfserr_deadlock;
 		break;
 	default:
@@ -5730,6 +5897,16 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
 		break;
 	}
 out:
+	if (nbl) {
+		/* dequeue it if we queued it before */
+		if (fl_flags & FL_SLEEP) {
+			spin_lock(&nn->client_lock);
+			list_del_init(&nbl->nbl_list);
+			list_del_init(&nbl->nbl_lru);
+			spin_unlock(&nn->client_lock);
+		}
+		free_blocked_lock(nbl);
+	}
 	if (filp)
 		fput(filp);
 	if (lock_stp) {
@@ -5753,8 +5930,6 @@ out:
 	if (open_stp)
 		nfs4_put_stid(&open_stp->st_stid);
 	nfsd4_bump_seqid(cstate, status);
-	if (file_lock)
-		locks_free_lock(file_lock);
 	if (conflock)
 		locks_free_lock(conflock);
 	return status;
@@ -6768,6 +6943,7 @@ static int nfs4_state_create_net(struct net *net)
 	INIT_LIST_HEAD(&nn->client_lru);
 	INIT_LIST_HEAD(&nn->close_lru);
 	INIT_LIST_HEAD(&nn->del_recall_lru);
+	INIT_LIST_HEAD(&nn->blocked_locks_lru);
 	spin_lock_init(&nn->client_lock);
 
 	INIT_DELAYED_WORK(&nn->laundromat_work, laundromat_main);
@@ -6865,6 +7041,7 @@ nfs4_state_shutdown_net(struct net *net)
 	struct nfs4_delegation *dp = NULL;
 	struct list_head *pos, *next, reaplist;
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
+	struct nfsd4_blocked_lock *nbl;
 
 	cancel_delayed_work_sync(&nn->laundromat_work);
 	locks_end_grace(&nn->nfsd4_manager);
@@ -6885,6 +7062,24 @@ nfs4_state_shutdown_net(struct net *net)
 		nfs4_put_stid(&dp->dl_stid);
 	}
 
+	BUG_ON(!list_empty(&reaplist));
+	spin_lock(&nn->client_lock);
+	while (!list_empty(&nn->blocked_locks_lru)) {
+		nbl = list_first_entry(&nn->blocked_locks_lru,
+					struct nfsd4_blocked_lock, nbl_lru);
+		list_move(&nbl->nbl_lru, &reaplist);
+		list_del_init(&nbl->nbl_list);
+	}
+	spin_unlock(&nn->client_lock);
+
+	while (!list_empty(&reaplist)) {
+		nbl = list_first_entry(&nn->blocked_locks_lru,
+					struct nfsd4_blocked_lock, nbl_lru);
+		list_del_init(&nbl->nbl_lru);
+		posix_unblock_lock(&nbl->nbl_lock);
+		free_blocked_lock(nbl);
+	}
+
 	nfsd4_client_tracking_exit(net);
 	nfs4_state_destroy_net(net);
 }
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 0aa0236a1429..c2d2895a1ec1 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -1694,6 +1694,30 @@ nfsd4_decode_clone(struct nfsd4_compoundargs *argp, struct nfsd4_clone *clone)
 }
 
 static __be32
+nfsd4_decode_copy(struct nfsd4_compoundargs *argp, struct nfsd4_copy *copy)
+{
+	DECODE_HEAD;
+	unsigned int tmp;
+
+	status = nfsd4_decode_stateid(argp, &copy->cp_src_stateid);
+	if (status)
+		return status;
+	status = nfsd4_decode_stateid(argp, &copy->cp_dst_stateid);
+	if (status)
+		return status;
+
+	READ_BUF(8 + 8 + 8 + 4 + 4 + 4);
+	p = xdr_decode_hyper(p, &copy->cp_src_pos);
+	p = xdr_decode_hyper(p, &copy->cp_dst_pos);
+	p = xdr_decode_hyper(p, &copy->cp_count);
+	copy->cp_consecutive = be32_to_cpup(p++);
+	copy->cp_synchronous = be32_to_cpup(p++);
+	tmp = be32_to_cpup(p); /* Source server list not supported */
+
+	DECODE_TAIL;
+}
+
+static __be32
 nfsd4_decode_seek(struct nfsd4_compoundargs *argp, struct nfsd4_seek *seek)
 {
 	DECODE_HEAD;
@@ -1793,7 +1817,7 @@ static nfsd4_dec nfsd4_dec_ops[] = {
 
 	/* new operations for NFSv4.2 */
 	[OP_ALLOCATE]		= (nfsd4_dec)nfsd4_decode_fallocate,
-	[OP_COPY]		= (nfsd4_dec)nfsd4_decode_notsupp,
+	[OP_COPY]		= (nfsd4_dec)nfsd4_decode_copy,
 	[OP_COPY_NOTIFY]	= (nfsd4_dec)nfsd4_decode_notsupp,
 	[OP_DEALLOCATE]		= (nfsd4_dec)nfsd4_decode_fallocate,
 	[OP_IO_ADVISE]		= (nfsd4_dec)nfsd4_decode_notsupp,
@@ -4062,7 +4086,7 @@ nfsd4_encode_getdeviceinfo(struct nfsd4_compoundres *resp, __be32 nfserr,
 	u32 starting_len = xdr->buf->len, needed_len;
 	__be32 *p;
 
-	dprintk("%s: err %d\n", __func__, nfserr);
+	dprintk("%s: err %d\n", __func__, be32_to_cpu(nfserr));
 	if (nfserr)
 		goto out;
 
@@ -4202,6 +4226,41 @@ nfsd4_encode_layoutreturn(struct nfsd4_compoundres *resp, __be32 nfserr,
 #endif /* CONFIG_NFSD_PNFS */
 
 static __be32
+nfsd42_encode_write_res(struct nfsd4_compoundres *resp, struct nfsd42_write_res *write)
+{
+	__be32 *p;
+
+	p = xdr_reserve_space(&resp->xdr, 4 + 8 + 4 + NFS4_VERIFIER_SIZE);
+	if (!p)
+		return nfserr_resource;
+
+	*p++ = cpu_to_be32(0);
+	p = xdr_encode_hyper(p, write->wr_bytes_written);
+	*p++ = cpu_to_be32(write->wr_stable_how);
+	p = xdr_encode_opaque_fixed(p, write->wr_verifier.data,
+				    NFS4_VERIFIER_SIZE);
+	return nfs_ok;
+}
+
+static __be32
+nfsd4_encode_copy(struct nfsd4_compoundres *resp, __be32 nfserr,
+		  struct nfsd4_copy *copy)
+{
+	__be32 *p;
+
+	if (!nfserr) {
+		nfserr = nfsd42_encode_write_res(resp, &copy->cp_res);
+		if (nfserr)
+			return nfserr;
+
+		p = xdr_reserve_space(&resp->xdr, 4 + 4);
+		*p++ = cpu_to_be32(copy->cp_consecutive);
+		*p++ = cpu_to_be32(copy->cp_synchronous);
+	}
+	return nfserr;
+}
+
+static __be32
 nfsd4_encode_seek(struct nfsd4_compoundres *resp, __be32 nfserr,
 		  struct nfsd4_seek *seek)
 {
@@ -4300,7 +4359,7 @@ static nfsd4_enc nfsd4_enc_ops[] = {
 
 	/* NFSv4.2 operations */
 	[OP_ALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
-	[OP_COPY]		= (nfsd4_enc)nfsd4_encode_noop,
+	[OP_COPY]		= (nfsd4_enc)nfsd4_encode_copy,
 	[OP_COPY_NOTIFY]	= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_DEALLOCATE]		= (nfsd4_enc)nfsd4_encode_noop,
 	[OP_IO_ADVISE]		= (nfsd4_enc)nfsd4_encode_noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index 65ad0165a94f..36b2af931e06 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -1216,6 +1216,8 @@ static __net_init int nfsd_init_net(struct net *net)
 		goto out_idmap_error;
 	nn->nfsd4_lease = 90;	/* default lease time */
 	nn->nfsd4_grace = 90;
+	nn->clverifier_counter = prandom_u32();
+	nn->clientid_counter = prandom_u32();
 	return 0;
 
 out_idmap_error:
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 08188743db53..010aff5c5a79 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -789,6 +789,7 @@ nfserrno (int errno)
 		{ nfserr_toosmall, -ETOOSMALL },
 		{ nfserr_serverfault, -ESERVERFAULT },
 		{ nfserr_serverfault, -ENFILE },
+		{ nfserr_io, -EUCLEAN },
 	};
 	int	i;
 
@@ -796,7 +797,7 @@ nfserrno (int errno)
 		if (nfs_errtbl[i].syserr == errno)
 			return nfs_errtbl[i].nfserr;
 	}
-	WARN(1, "nfsd: non-standard errno: %d\n", errno);
+	WARN_ONCE(1, "nfsd: non-standard errno: %d\n", errno);
 	return nfserr_io;
 }
 
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 45007acaf364..a2b65fc56dd6 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -366,14 +366,21 @@ static struct notifier_block nfsd_inet6addr_notifier = {
 };
 #endif
 
+/* Only used under nfsd_mutex, so this atomic may be overkill: */
+static atomic_t nfsd_notifier_refcount = ATOMIC_INIT(0);
+
 static void nfsd_last_thread(struct svc_serv *serv, struct net *net)
 {
 	struct nfsd_net *nn = net_generic(net, nfsd_net_id);
 
-	unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
+	/* check if the notifier still has clients */
+	if (atomic_dec_return(&nfsd_notifier_refcount) == 0) {
+		unregister_inetaddr_notifier(&nfsd_inetaddr_notifier);
 #if IS_ENABLED(CONFIG_IPV6)
-	unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
+		unregister_inet6addr_notifier(&nfsd_inet6addr_notifier);
 #endif
+	}
+
 	/*
 	 * write_ports can create the server without actually starting
 	 * any threads--if we get shut down before any threads are
@@ -488,10 +495,13 @@ int nfsd_create_serv(struct net *net)
 	}
 
 	set_max_drc();
-	register_inetaddr_notifier(&nfsd_inetaddr_notifier);
+	/* check if the notifier is already set */
+	if (atomic_inc_return(&nfsd_notifier_refcount) == 1) {
+		register_inetaddr_notifier(&nfsd_inetaddr_notifier);
 #if IS_ENABLED(CONFIG_IPV6)
-	register_inet6addr_notifier(&nfsd_inet6addr_notifier);
+		register_inet6addr_notifier(&nfsd_inet6addr_notifier);
 #endif
+	}
 	do_gettimeofday(&nn->nfssvc_boot);		/* record boot time */
 	return 0;
 }
diff --git a/fs/nfsd/pnfs.h b/fs/nfsd/pnfs.h
index 0c2a716e8741..d27a5aa60022 100644
--- a/fs/nfsd/pnfs.h
+++ b/fs/nfsd/pnfs.h
@@ -19,6 +19,7 @@ struct nfsd4_deviceid_map {
 
 struct nfsd4_layout_ops {
 	u32		notify_types;
+	bool		disable_recalls;
 
 	__be32 (*proc_getdeviceinfo)(struct super_block *sb,
 			struct svc_rqst *rqstp,
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index b95adf9a1595..c9399366f9df 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -63,7 +63,6 @@ typedef struct {
 
 struct nfsd4_callback {
 	struct nfs4_client *cb_clp;
-	u32 cb_minorversion;
 	struct rpc_message cb_msg;
 	const struct nfsd4_callback_ops *cb_ops;
 	struct work_struct cb_work;
@@ -441,11 +440,11 @@ struct nfs4_openowner {
 /*
  * Represents a generic "lockowner". Similar to an openowner. References to it
  * are held by the lock stateids that are created on its behalf. This object is
- * a superset of the nfs4_stateowner struct (or would be if it needed any extra
- * fields).
+ * a superset of the nfs4_stateowner struct.
  */
 struct nfs4_lockowner {
-	struct nfs4_stateowner	lo_owner; /* must be first element */
+	struct nfs4_stateowner	lo_owner;	/* must be first element */
+	struct list_head	lo_blocked;	/* blocked file_locks */
 };
 
 static inline struct nfs4_openowner * openowner(struct nfs4_stateowner *so)
@@ -572,6 +571,7 @@ enum nfsd4_cb_op {
 	NFSPROC4_CLNT_CB_RECALL,
 	NFSPROC4_CLNT_CB_LAYOUT,
 	NFSPROC4_CLNT_CB_SEQUENCE,
+	NFSPROC4_CLNT_CB_NOTIFY_LOCK,
 };
 
 /* Returns true iff a is later than b: */
@@ -580,6 +580,20 @@ static inline bool nfsd4_stateid_generation_after(stateid_t *a, stateid_t *b)
 	return (s32)(a->si_generation - b->si_generation) > 0;
 }
 
+/*
+ * When a client tries to get a lock on a file, we set one of these objects
+ * on the blocking lock. When the lock becomes free, we can then issue a
+ * CB_NOTIFY_LOCK to the server.
+ */
+struct nfsd4_blocked_lock {
+	struct list_head	nbl_list;
+	struct list_head	nbl_lru;
+	unsigned long		nbl_time;
+	struct file_lock	nbl_lock;
+	struct knfsd_fh		nbl_fh;
+	struct nfsd4_callback	nbl_cb;
+};
+
 struct nfsd4_compound_state;
 struct nfsd_net;
 
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index ff476e654b8f..8ca642fe9b21 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -513,6 +513,22 @@ __be32 nfsd4_clone_file_range(struct file *src, u64 src_pos, struct file *dst,
 			count));
 }
 
+ssize_t nfsd_copy_file_range(struct file *src, u64 src_pos, struct file *dst,
+			     u64 dst_pos, u64 count)
+{
+
+	/*
+	 * Limit copy to 4MB to prevent indefinitely blocking an nfsd
+	 * thread and client rpc slot.  The choice of 4MB is somewhat
+	 * arbitrary.  We might instead base this on r/wsize, or make it
+	 * tunable, or use a time instead of a byte limit, or implement
+	 * asynchronous copy.  In theory a client could also recognize a
+	 * limit like this and pipeline multiple COPY requests.
+	 */
+	count = min_t(u64, count, 1 << 22);
+	return vfs_copy_file_range(src, src_pos, dst, dst_pos, count, 0);
+}
+
 __be32 nfsd4_vfs_fallocate(struct svc_rqst *rqstp, struct svc_fh *fhp,
 			   struct file *file, loff_t offset, loff_t len,
 			   int flags)
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index 3cbb1b33777b..0bf9e7bf5800 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -96,6 +96,8 @@ __be32		nfsd_symlink(struct svc_rqst *, struct svc_fh *,
 				struct svc_fh *res);
 __be32		nfsd_link(struct svc_rqst *, struct svc_fh *,
 				char *, int, struct svc_fh *);
+ssize_t		nfsd_copy_file_range(struct file *, u64,
+				     struct file *, u64, u64);
 __be32		nfsd_rename(struct svc_rqst *,
 				struct svc_fh *, char *, int,
 				struct svc_fh *, char *, int);
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index beea0c5edc51..8fda4abdf3b1 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -503,6 +503,28 @@ struct nfsd4_clone {
 	u64		cl_count;
 };
 
+struct nfsd42_write_res {
+	u64			wr_bytes_written;
+	u32			wr_stable_how;
+	nfs4_verifier		wr_verifier;
+};
+
+struct nfsd4_copy {
+	/* request */
+	stateid_t	cp_src_stateid;
+	stateid_t	cp_dst_stateid;
+	u64		cp_src_pos;
+	u64		cp_dst_pos;
+	u64		cp_count;
+
+	/* both */
+	bool		cp_consecutive;
+	bool		cp_synchronous;
+
+	/* response */
+	struct nfsd42_write_res	cp_res;
+};
+
 struct nfsd4_seek {
 	/* request */
 	stateid_t	seek_stateid;
@@ -568,6 +590,7 @@ struct nfsd4_op {
 		struct nfsd4_fallocate		allocate;
 		struct nfsd4_fallocate		deallocate;
 		struct nfsd4_clone		clone;
+		struct nfsd4_copy		copy;
 		struct nfsd4_seek		seek;
 	} u;
 	struct nfs4_replay *			replay;
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index c47f6fdb111a..49b719dfef95 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -28,3 +28,12 @@
 #define NFS4_dec_cb_layout_sz		(cb_compound_dec_hdr_sz  +      \
 					cb_sequence_dec_sz +            \
 					op_dec_sz)
+
+#define NFS4_enc_cb_notify_lock_sz	(cb_compound_enc_hdr_sz +        \
+					cb_sequence_enc_sz +             \
+					2 + 1 +				 \
+					XDR_QUADLEN(NFS4_OPAQUE_LIMIT) + \
+					enc_nfs4_fh_sz)
+#define NFS4_dec_cb_notify_lock_sz	(cb_compound_dec_hdr_sz  +      \
+					cb_sequence_dec_sz +            \
+					op_dec_sz)
diff --git a/fs/open.c b/fs/open.c
index a7719cfb7257..d3ed8171e8e0 100644
--- a/fs/open.c
+++ b/fs/open.c
@@ -267,6 +267,11 @@ int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len)
 	    (mode & ~FALLOC_FL_INSERT_RANGE))
 		return -EINVAL;
 
+	/* Unshare range should only be used with allocate mode. */
+	if ((mode & FALLOC_FL_UNSHARE_RANGE) &&
+	    (mode & ~(FALLOC_FL_UNSHARE_RANGE | FALLOC_FL_KEEP_SIZE)))
+		return -EINVAL;
+
 	if (!(file->f_mode & FMODE_WRITE))
 		return -EBADF;
 
diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile
index 584e87e11cb6..26ef1958b65b 100644
--- a/fs/xfs/Makefile
+++ b/fs/xfs/Makefile
@@ -55,6 +55,8 @@ xfs-y				+= $(addprefix libxfs/, \
 				   xfs_ag_resv.o \
 				   xfs_rmap.o \
 				   xfs_rmap_btree.o \
+				   xfs_refcount.o \
+				   xfs_refcount_btree.o \
 				   xfs_sb.o \
 				   xfs_symlink_remote.o \
 				   xfs_trans_resv.o \
@@ -88,6 +90,7 @@ xfs-y				+= xfs_aops.o \
 				   xfs_message.o \
 				   xfs_mount.o \
 				   xfs_mru_cache.o \
+				   xfs_reflink.o \
 				   xfs_stats.o \
 				   xfs_super.o \
 				   xfs_symlink.o \
@@ -100,16 +103,20 @@ xfs-y				+= xfs_aops.o \
 # low-level transaction/log code
 xfs-y				+= xfs_log.o \
 				   xfs_log_cil.o \
+				   xfs_bmap_item.o \
 				   xfs_buf_item.o \
 				   xfs_extfree_item.o \
 				   xfs_icreate_item.o \
 				   xfs_inode_item.o \
+				   xfs_refcount_item.o \
 				   xfs_rmap_item.o \
 				   xfs_log_recover.o \
 				   xfs_trans_ail.o \
+				   xfs_trans_bmap.o \
 				   xfs_trans_buf.o \
 				   xfs_trans_extfree.o \
 				   xfs_trans_inode.o \
+				   xfs_trans_refcount.o \
 				   xfs_trans_rmap.o \
 
 # optional features
diff --git a/fs/xfs/libxfs/xfs_ag_resv.c b/fs/xfs/libxfs/xfs_ag_resv.c
index e3ae0f2b4294..e5ebc3770460 100644
--- a/fs/xfs/libxfs/xfs_ag_resv.c
+++ b/fs/xfs/libxfs/xfs_ag_resv.c
@@ -38,6 +38,7 @@
 #include "xfs_trans_space.h"
 #include "xfs_rmap_btree.h"
 #include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
 
 /*
  * Per-AG Block Reservations
@@ -108,7 +109,9 @@ xfs_ag_resv_critical(
 	trace_xfs_ag_resv_critical(pag, type, avail);
 
 	/* Critically low if less than 10% or max btree height remains. */
-	return avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS;
+	return XFS_TEST_ERROR(avail < orig / 10 || avail < XFS_BTREE_MAXLEVELS,
+			pag->pag_mount, XFS_ERRTAG_AG_RESV_CRITICAL,
+			XFS_RANDOM_AG_RESV_CRITICAL);
 }
 
 /*
@@ -228,6 +231,11 @@ xfs_ag_resv_init(
 	if (pag->pag_meta_resv.ar_asked == 0) {
 		ask = used = 0;
 
+		error = xfs_refcountbt_calc_reserves(pag->pag_mount,
+				pag->pag_agno, &ask, &used);
+		if (error)
+			goto out;
+
 		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_METADATA,
 				ask, used);
 		if (error)
@@ -238,6 +246,11 @@ xfs_ag_resv_init(
 	if (pag->pag_agfl_resv.ar_asked == 0) {
 		ask = used = 0;
 
+		error = xfs_rmapbt_calc_reserves(pag->pag_mount, pag->pag_agno,
+				&ask, &used);
+		if (error)
+			goto out;
+
 		error = __xfs_ag_resv_init(pag, XFS_AG_RESV_AGFL, ask, used);
 		if (error)
 			goto out;
diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c
index ca75dc90ebe0..effb64cf714f 100644
--- a/fs/xfs/libxfs/xfs_alloc.c
+++ b/fs/xfs/libxfs/xfs_alloc.c
@@ -52,10 +52,23 @@ STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
 STATIC int xfs_alloc_ag_vextent_small(xfs_alloc_arg_t *,
 		xfs_btree_cur_t *, xfs_agblock_t *, xfs_extlen_t *, int *);
 
+unsigned int
+xfs_refc_block(
+	struct xfs_mount	*mp)
+{
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return XFS_RMAP_BLOCK(mp) + 1;
+	if (xfs_sb_version_hasfinobt(&mp->m_sb))
+		return XFS_FIBT_BLOCK(mp) + 1;
+	return XFS_IBT_BLOCK(mp) + 1;
+}
+
 xfs_extlen_t
 xfs_prealloc_blocks(
 	struct xfs_mount	*mp)
 {
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		return xfs_refc_block(mp) + 1;
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
 		return XFS_RMAP_BLOCK(mp) + 1;
 	if (xfs_sb_version_hasfinobt(&mp->m_sb))
@@ -115,6 +128,8 @@ xfs_alloc_ag_max_usable(
 		blocks++;		/* finobt root block */
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
 		blocks++; 		/* rmap root block */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		blocks++;		/* refcount root block */
 
 	return mp->m_sb.sb_agblocks - blocks;
 }
@@ -2321,6 +2336,9 @@ xfs_alloc_log_agf(
 		offsetof(xfs_agf_t, agf_btreeblks),
 		offsetof(xfs_agf_t, agf_uuid),
 		offsetof(xfs_agf_t, agf_rmap_blocks),
+		offsetof(xfs_agf_t, agf_refcount_blocks),
+		offsetof(xfs_agf_t, agf_refcount_root),
+		offsetof(xfs_agf_t, agf_refcount_level),
 		/* needed so that we don't log the whole rest of the structure: */
 		offsetof(xfs_agf_t, agf_spare64),
 		sizeof(xfs_agf_t)
@@ -2458,6 +2476,10 @@ xfs_agf_verify(
 	    be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
 		return false;
 
+	if (xfs_sb_version_hasreflink(&mp->m_sb) &&
+	    be32_to_cpu(agf->agf_refcount_level) > XFS_BTREE_MAXLEVELS)
+		return false;
+
 	return true;;
 
 }
@@ -2578,6 +2600,7 @@ xfs_alloc_read_agf(
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
 		pag->pagf_levels[XFS_BTNUM_RMAPi] =
 			be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
+		pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
 		spin_lock_init(&pag->pagb_lock);
 		pag->pagb_count = 0;
 		pag->pagb_tree = RB_ROOT;
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index 9d7f61d36645..c27344cf38e1 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -48,6 +48,7 @@
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
 #include "xfs_ag_resv.h"
+#include "xfs_refcount.h"
 
 
 kmem_zone_t		*xfs_bmap_free_item_zone;
@@ -140,7 +141,8 @@ xfs_bmbt_lookup_ge(
  */
 static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
 {
-	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
+	return whichfork != XFS_COW_FORK &&
+		XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS &&
 		XFS_IFORK_NEXTENTS(ip, whichfork) >
 			XFS_IFORK_MAXEXT(ip, whichfork);
 }
@@ -150,7 +152,8 @@ static inline bool xfs_bmap_needs_btree(struct xfs_inode *ip, int whichfork)
  */
 static inline bool xfs_bmap_wants_extents(struct xfs_inode *ip, int whichfork)
 {
-	return XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
+	return whichfork != XFS_COW_FORK &&
+		XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE &&
 		XFS_IFORK_NEXTENTS(ip, whichfork) <=
 			XFS_IFORK_MAXEXT(ip, whichfork);
 }
@@ -640,6 +643,7 @@ xfs_bmap_btree_to_extents(
 
 	mp = ip->i_mount;
 	ifp = XFS_IFORK_PTR(ip, whichfork);
+	ASSERT(whichfork != XFS_COW_FORK);
 	ASSERT(ifp->if_flags & XFS_IFEXTENTS);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_BTREE);
 	rblock = ifp->if_broot;
@@ -706,6 +710,7 @@ xfs_bmap_extents_to_btree(
 	xfs_bmbt_ptr_t		*pp;		/* root block address pointer */
 
 	mp = ip->i_mount;
+	ASSERT(whichfork != XFS_COW_FORK);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_EXTENTS);
 
@@ -748,6 +753,7 @@ xfs_bmap_extents_to_btree(
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = XFS_INO_TO_FSB(mp, ip->i_ino);
 	} else if (dfops->dop_low) {
+try_another_ag:
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		args.fsbno = *firstblock;
 	} else {
@@ -762,6 +768,21 @@ xfs_bmap_extents_to_btree(
 		xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
 		return error;
 	}
+
+	/*
+	 * During a CoW operation, the allocation and bmbt updates occur in
+	 * different transactions.  The mapping code tries to put new bmbt
+	 * blocks near extents being mapped, but the only way to guarantee this
+	 * is if the alloc and the mapping happen in a single transaction that
+	 * has a block reservation.  That isn't the case here, so if we run out
+	 * of space we'll try again with another AG.
+	 */
+	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+	    args.fsbno == NULLFSBLOCK &&
+	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+		dfops->dop_low = true;
+		goto try_another_ag;
+	}
 	/*
 	 * Allocation can't fail, the space was reserved.
 	 */
@@ -837,6 +858,7 @@ xfs_bmap_local_to_extents_empty(
 {
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 
+	ASSERT(whichfork != XFS_COW_FORK);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) == XFS_DINODE_FMT_LOCAL);
 	ASSERT(ifp->if_bytes == 0);
 	ASSERT(XFS_IFORK_NEXTENTS(ip, whichfork) == 0);
@@ -896,6 +918,7 @@ xfs_bmap_local_to_extents(
 	 * file currently fits in an inode.
 	 */
 	if (*firstblock == NULLFSBLOCK) {
+try_another_ag:
 		args.fsbno = XFS_INO_TO_FSB(args.mp, ip->i_ino);
 		args.type = XFS_ALLOCTYPE_START_BNO;
 	} else {
@@ -908,6 +931,19 @@ xfs_bmap_local_to_extents(
 	if (error)
 		goto done;
 
+	/*
+	 * During a CoW operation, the allocation and bmbt updates occur in
+	 * different transactions.  The mapping code tries to put new bmbt
+	 * blocks near extents being mapped, but the only way to guarantee this
+	 * is if the alloc and the mapping happen in a single transaction that
+	 * has a block reservation.  That isn't the case here, so if we run out
+	 * of space we'll try again with another AG.
+	 */
+	if (xfs_sb_version_hasreflink(&ip->i_mount->m_sb) &&
+	    args.fsbno == NULLFSBLOCK &&
+	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+		goto try_another_ag;
+	}
 	/* Can't fail, the space was reserved. */
 	ASSERT(args.fsbno != NULLFSBLOCK);
 	ASSERT(args.len == 1);
@@ -1670,7 +1706,8 @@ xfs_bmap_one_block(
  */
 STATIC int				/* error */
 xfs_bmap_add_extent_delay_real(
-	struct xfs_bmalloca	*bma)
+	struct xfs_bmalloca	*bma,
+	int			whichfork)
 {
 	struct xfs_bmbt_irec	*new = &bma->got;
 	int			diff;	/* temp value */
@@ -1688,11 +1725,14 @@ xfs_bmap_add_extent_delay_real(
 	xfs_filblks_t		temp=0;	/* value for da_new calculations */
 	xfs_filblks_t		temp2=0;/* value for da_new calculations */
 	int			tmp_rval;	/* partial logging flags */
-	int			whichfork = XFS_DATA_FORK;
 	struct xfs_mount	*mp;
+	xfs_extnum_t		*nextents;
 
 	mp = bma->ip->i_mount;
 	ifp = XFS_IFORK_PTR(bma->ip, whichfork);
+	ASSERT(whichfork != XFS_ATTR_FORK);
+	nextents = (whichfork == XFS_COW_FORK ? &bma->ip->i_cnextents :
+						&bma->ip->i_d.di_nextents);
 
 	ASSERT(bma->idx >= 0);
 	ASSERT(bma->idx <= ifp->if_bytes / sizeof(struct xfs_bmbt_rec));
@@ -1706,6 +1746,9 @@ xfs_bmap_add_extent_delay_real(
 #define	RIGHT		r[1]
 #define	PREV		r[2]
 
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
+
 	/*
 	 * Set up a bunch of variables to make the tests simpler.
 	 */
@@ -1792,7 +1835,7 @@ xfs_bmap_add_extent_delay_real(
 		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
 		xfs_iext_remove(bma->ip, bma->idx + 1, 2, state);
-		bma->ip->i_d.di_nextents--;
+		(*nextents)--;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -1894,7 +1937,7 @@ xfs_bmap_add_extent_delay_real(
 		xfs_bmbt_set_startblock(ep, new->br_startblock);
 		trace_xfs_bmap_post_update(bma->ip, bma->idx, state, _THIS_IP_);
 
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -1964,7 +2007,7 @@ xfs_bmap_add_extent_delay_real(
 		temp = PREV.br_blockcount - new->br_blockcount;
 		xfs_bmbt_set_blockcount(ep, temp);
 		xfs_iext_insert(bma->ip, bma->idx, 1, new, state);
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2048,7 +2091,7 @@ xfs_bmap_add_extent_delay_real(
 		trace_xfs_bmap_pre_update(bma->ip, bma->idx, state, _THIS_IP_);
 		xfs_bmbt_set_blockcount(ep, temp);
 		xfs_iext_insert(bma->ip, bma->idx + 1, 1, new, state);
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2117,7 +2160,7 @@ xfs_bmap_add_extent_delay_real(
 		RIGHT.br_blockcount = temp2;
 		/* insert LEFT (r[0]) and RIGHT (r[1]) at the same time */
 		xfs_iext_insert(bma->ip, bma->idx + 1, 2, &LEFT, state);
-		bma->ip->i_d.di_nextents++;
+		(*nextents)++;
 		if (bma->cur == NULL)
 			rval = XFS_ILOG_CORE | XFS_ILOG_DEXT;
 		else {
@@ -2215,7 +2258,8 @@ xfs_bmap_add_extent_delay_real(
 
 	xfs_bmap_check_leaf_extents(bma->cur, bma->ip, whichfork);
 done:
-	bma->logflags |= rval;
+	if (whichfork != XFS_COW_FORK)
+		bma->logflags |= rval;
 	return error;
 #undef	LEFT
 #undef	RIGHT
@@ -2759,6 +2803,7 @@ done:
 STATIC void
 xfs_bmap_add_extent_hole_delay(
 	xfs_inode_t		*ip,	/* incore inode pointer */
+	int			whichfork,
 	xfs_extnum_t		*idx,	/* extent number to update/insert */
 	xfs_bmbt_irec_t		*new)	/* new data to add to file extents */
 {
@@ -2770,8 +2815,10 @@ xfs_bmap_add_extent_hole_delay(
 	int			state;  /* state bits, accessed thru macros */
 	xfs_filblks_t		temp=0;	/* temp for indirect calculations */
 
-	ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	ifp = XFS_IFORK_PTR(ip, whichfork);
 	state = 0;
+	if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 	ASSERT(isnullstartblock(new->br_startblock));
 
 	/*
@@ -2789,7 +2836,7 @@ xfs_bmap_add_extent_hole_delay(
 	 * Check and set flags if the current (right) segment exists.
 	 * If it doesn't exist, we're converting the hole at end-of-file.
 	 */
-	if (*idx < ip->i_df.if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
+	if (*idx < ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t)) {
 		state |= BMAP_RIGHT_VALID;
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, *idx), &right);
 
@@ -2923,6 +2970,7 @@ xfs_bmap_add_extent_hole_real(
 	ASSERT(!isnullstartblock(new->br_startblock));
 	ASSERT(!bma->cur ||
 	       !(bma->cur->bc_private.b.flags & XFS_BTCUR_BPRV_WASDEL));
+	ASSERT(whichfork != XFS_COW_FORK);
 
 	XFS_STATS_INC(mp, xs_add_exlist);
 
@@ -3648,7 +3696,9 @@ xfs_bmap_btalloc(
 	else if (mp->m_dalign)
 		stripe_align = mp->m_dalign;
 
-	if (xfs_alloc_is_userdata(ap->datatype))
+	if (ap->flags & XFS_BMAPI_COWFORK)
+		align = xfs_get_cowextsz_hint(ap->ip);
+	else if (xfs_alloc_is_userdata(ap->datatype))
 		align = xfs_get_extsz_hint(ap->ip);
 	if (unlikely(align)) {
 		error = xfs_bmap_extsize_align(mp, &ap->got, &ap->prev,
@@ -3856,7 +3906,8 @@ xfs_bmap_btalloc(
 		ASSERT(nullfb || fb_agno == args.agno ||
 		       (ap->dfops->dop_low && fb_agno < args.agno));
 		ap->length = args.len;
-		ap->ip->i_d.di_nblocks += args.len;
+		if (!(ap->flags & XFS_BMAPI_COWFORK))
+			ap->ip->i_d.di_nblocks += args.len;
 		xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
 		if (ap->wasdel)
 			ap->ip->i_delayed_blks -= args.len;
@@ -3876,6 +3927,63 @@ xfs_bmap_btalloc(
 }
 
 /*
+ * For a remap operation, just "allocate" an extent at the address that the
+ * caller passed in, and ensure that the AGFL is the right size.  The caller
+ * will then map the "allocated" extent into the file somewhere.
+ */
+STATIC int
+xfs_bmap_remap_alloc(
+	struct xfs_bmalloca	*ap)
+{
+	struct xfs_trans	*tp = ap->tp;
+	struct xfs_mount	*mp = tp->t_mountp;
+	xfs_agblock_t		bno;
+	struct xfs_alloc_arg	args;
+	int			error;
+
+	/*
+	 * validate that the block number is legal - the enables us to detect
+	 * and handle a silent filesystem corruption rather than crashing.
+	 */
+	memset(&args, 0, sizeof(struct xfs_alloc_arg));
+	args.tp = ap->tp;
+	args.mp = ap->tp->t_mountp;
+	bno = *ap->firstblock;
+	args.agno = XFS_FSB_TO_AGNO(mp, bno);
+	args.agbno = XFS_FSB_TO_AGBNO(mp, bno);
+	if (args.agno >= mp->m_sb.sb_agcount ||
+	    args.agbno >= mp->m_sb.sb_agblocks)
+		return -EFSCORRUPTED;
+
+	/* "Allocate" the extent from the range we passed in. */
+	trace_xfs_bmap_remap_alloc(ap->ip, *ap->firstblock, ap->length);
+	ap->blkno = bno;
+	ap->ip->i_d.di_nblocks += ap->length;
+	xfs_trans_log_inode(ap->tp, ap->ip, XFS_ILOG_CORE);
+
+	/* Fix the freelist, like a real allocator does. */
+	args.datatype = ap->datatype;
+	args.pag = xfs_perag_get(args.mp, args.agno);
+	ASSERT(args.pag);
+
+	/*
+	 * The freelist fixing code will decline the allocation if
+	 * the size and shape of the free space doesn't allow for
+	 * allocating the extent and updating all the metadata that
+	 * happens during an allocation.  We're remapping, not
+	 * allocating, so skip that check by pretending to be freeing.
+	 */
+	error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
+	if (error)
+		goto error0;
+error0:
+	xfs_perag_put(args.pag);
+	if (error)
+		trace_xfs_bmap_remap_alloc_error(ap->ip, error, _RET_IP_);
+	return error;
+}
+
+/*
  * xfs_bmap_alloc is called by xfs_bmapi to allocate an extent for a file.
  * It figures out where to ask the underlying allocator to put the new extent.
  */
@@ -3883,6 +3991,8 @@ STATIC int
 xfs_bmap_alloc(
 	struct xfs_bmalloca	*ap)	/* bmap alloc argument struct */
 {
+	if (ap->flags & XFS_BMAPI_REMAP)
+		return xfs_bmap_remap_alloc(ap);
 	if (XFS_IS_REALTIME_INODE(ap->ip) &&
 	    xfs_alloc_is_userdata(ap->datatype))
 		return xfs_bmap_rtalloc(ap);
@@ -4012,12 +4122,11 @@ xfs_bmapi_read(
 	int			error;
 	int			eof;
 	int			n = 0;
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
+	int			whichfork = xfs_bmapi_whichfork(flags);
 
 	ASSERT(*nmap >= 1);
 	ASSERT(!(flags & ~(XFS_BMAPI_ATTRFORK|XFS_BMAPI_ENTIRE|
-			   XFS_BMAPI_IGSTATE)));
+			   XFS_BMAPI_IGSTATE|XFS_BMAPI_COWFORK)));
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_SHARED|XFS_ILOCK_EXCL));
 
 	if (unlikely(XFS_TEST_ERROR(
@@ -4035,6 +4144,16 @@ xfs_bmapi_read(
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 
+	/* No CoW fork?  Return a hole. */
+	if (whichfork == XFS_COW_FORK && !ifp) {
+		mval->br_startoff = bno;
+		mval->br_startblock = HOLESTARTBLOCK;
+		mval->br_blockcount = len;
+		mval->br_state = XFS_EXT_NORM;
+		*nmap = 1;
+		return 0;
+	}
+
 	if (!(ifp->if_flags & XFS_IFEXTENTS)) {
 		error = xfs_iread_extents(NULL, ip, whichfork);
 		if (error)
@@ -4084,6 +4203,7 @@ xfs_bmapi_read(
 int
 xfs_bmapi_reserve_delalloc(
 	struct xfs_inode	*ip,
+	int			whichfork,
 	xfs_fileoff_t		aoff,
 	xfs_filblks_t		len,
 	struct xfs_bmbt_irec	*got,
@@ -4092,7 +4212,7 @@ xfs_bmapi_reserve_delalloc(
 	int			eof)
 {
 	struct xfs_mount	*mp = ip->i_mount;
-	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
+	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 	xfs_extlen_t		alen;
 	xfs_extlen_t		indlen;
 	char			rt = XFS_IS_REALTIME_INODE(ip);
@@ -4104,7 +4224,10 @@ xfs_bmapi_reserve_delalloc(
 		alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff);
 
 	/* Figure out the extent size, adjust alen */
-	extsz = xfs_get_extsz_hint(ip);
+	if (whichfork == XFS_COW_FORK)
+		extsz = xfs_get_cowextsz_hint(ip);
+	else
+		extsz = xfs_get_extsz_hint(ip);
 	if (extsz) {
 		error = xfs_bmap_extsize_align(mp, got, prev, extsz, rt, eof,
 					       1, 0, &aoff, &alen);
@@ -4151,7 +4274,7 @@ xfs_bmapi_reserve_delalloc(
 	got->br_startblock = nullstartblock(indlen);
 	got->br_blockcount = alen;
 	got->br_state = XFS_EXT_NORM;
-	xfs_bmap_add_extent_hole_delay(ip, lastx, got);
+	xfs_bmap_add_extent_hole_delay(ip, whichfork, lastx, got);
 
 	/*
 	 * Update our extent pointer, given that xfs_bmap_add_extent_hole_delay
@@ -4182,8 +4305,7 @@ xfs_bmapi_allocate(
 	struct xfs_bmalloca	*bma)
 {
 	struct xfs_mount	*mp = bma->ip->i_mount;
-	int			whichfork = (bma->flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
+	int			whichfork = xfs_bmapi_whichfork(bma->flags);
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 	int			tmp_logflags = 0;
 	int			error;
@@ -4278,7 +4400,7 @@ xfs_bmapi_allocate(
 		bma->got.br_state = XFS_EXT_UNWRITTEN;
 
 	if (bma->wasdel)
-		error = xfs_bmap_add_extent_delay_real(bma);
+		error = xfs_bmap_add_extent_delay_real(bma, whichfork);
 	else
 		error = xfs_bmap_add_extent_hole_real(bma, whichfork);
 
@@ -4308,8 +4430,7 @@ xfs_bmapi_convert_unwritten(
 	xfs_filblks_t		len,
 	int			flags)
 {
-	int			whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-						XFS_ATTR_FORK : XFS_DATA_FORK;
+	int			whichfork = xfs_bmapi_whichfork(flags);
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(bma->ip, whichfork);
 	int			tmp_logflags = 0;
 	int			error;
@@ -4325,6 +4446,8 @@ xfs_bmapi_convert_unwritten(
 			(XFS_BMAPI_PREALLOC | XFS_BMAPI_CONVERT))
 		return 0;
 
+	ASSERT(whichfork != XFS_COW_FORK);
+
 	/*
 	 * Modify (by adding) the state flag, if writing.
 	 */
@@ -4431,8 +4554,7 @@ xfs_bmapi_write(
 	orig_mval = mval;
 	orig_nmap = *nmap;
 #endif
-	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-		XFS_ATTR_FORK : XFS_DATA_FORK;
+	whichfork = xfs_bmapi_whichfork(flags);
 
 	ASSERT(*nmap >= 1);
 	ASSERT(*nmap <= XFS_BMAP_MAX_NMAP);
@@ -4441,6 +4563,11 @@ xfs_bmapi_write(
 	ASSERT(len > 0);
 	ASSERT(XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_LOCAL);
 	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL));
+	ASSERT(!(flags & XFS_BMAPI_REMAP) || whichfork == XFS_DATA_FORK);
+	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || !(flags & XFS_BMAPI_REMAP));
+	ASSERT(!(flags & XFS_BMAPI_CONVERT) || !(flags & XFS_BMAPI_REMAP));
+	ASSERT(!(flags & XFS_BMAPI_PREALLOC) || whichfork != XFS_COW_FORK);
+	ASSERT(!(flags & XFS_BMAPI_CONVERT) || whichfork != XFS_COW_FORK);
 
 	/* zeroing is for currently only for data extents, not metadata */
 	ASSERT((flags & (XFS_BMAPI_METADATA | XFS_BMAPI_ZERO)) !=
@@ -4502,6 +4629,14 @@ xfs_bmapi_write(
 		wasdelay = !inhole && isnullstartblock(bma.got.br_startblock);
 
 		/*
+		 * Make sure we only reflink into a hole.
+		 */
+		if (flags & XFS_BMAPI_REMAP)
+			ASSERT(inhole);
+		if (flags & XFS_BMAPI_COWFORK)
+			ASSERT(!inhole);
+
+		/*
 		 * First, deal with the hole before the allocated space
 		 * that we found, if any.
 		 */
@@ -4531,6 +4666,17 @@ xfs_bmapi_write(
 				goto error0;
 			if (bma.blkno == NULLFSBLOCK)
 				break;
+
+			/*
+			 * If this is a CoW allocation, record the data in
+			 * the refcount btree for orphan recovery.
+			 */
+			if (whichfork == XFS_COW_FORK) {
+				error = xfs_refcount_alloc_cow_extent(mp, dfops,
+						bma.blkno, bma.length);
+				if (error)
+					goto error0;
+			}
 		}
 
 		/* Deal with the allocated space we found.  */
@@ -4696,7 +4842,8 @@ xfs_bmap_del_extent(
 	xfs_btree_cur_t		*cur,	/* if null, not a btree */
 	xfs_bmbt_irec_t		*del,	/* data to remove from extents */
 	int			*logflagsp, /* inode logging flags */
-	int			whichfork) /* data or attr fork */
+	int			whichfork, /* data or attr fork */
+	int			bflags)	/* bmapi flags */
 {
 	xfs_filblks_t		da_new;	/* new delay-alloc indirect blocks */
 	xfs_filblks_t		da_old;	/* old delay-alloc indirect blocks */
@@ -4725,6 +4872,8 @@ xfs_bmap_del_extent(
 
 	if (whichfork == XFS_ATTR_FORK)
 		state |= BMAP_ATTRFORK;
+	else if (whichfork == XFS_COW_FORK)
+		state |= BMAP_COWFORK;
 
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	ASSERT((*idx >= 0) && (*idx < ifp->if_bytes /
@@ -4805,6 +4954,7 @@ xfs_bmap_del_extent(
 		/*
 		 * Matches the whole extent.  Delete the entry.
 		 */
+		trace_xfs_bmap_pre_update(ip, *idx, state, _THIS_IP_);
 		xfs_iext_remove(ip, *idx, 1,
 				whichfork == XFS_ATTR_FORK ? BMAP_ATTRFORK : 0);
 		--*idx;
@@ -4988,9 +5138,16 @@ xfs_bmap_del_extent(
 	/*
 	 * If we need to, add to list of extents to delete.
 	 */
-	if (do_fx)
-		xfs_bmap_add_free(mp, dfops, del->br_startblock,
-				del->br_blockcount, NULL);
+	if (do_fx && !(bflags & XFS_BMAPI_REMAP)) {
+		if (xfs_is_reflink_inode(ip) && whichfork == XFS_DATA_FORK) {
+			error = xfs_refcount_decrease_extent(mp, dfops, del);
+			if (error)
+				goto done;
+		} else
+			xfs_bmap_add_free(mp, dfops, del->br_startblock,
+					del->br_blockcount, NULL);
+	}
+
 	/*
 	 * Adjust inode # blocks in the file.
 	 */
@@ -4999,7 +5156,7 @@ xfs_bmap_del_extent(
 	/*
 	 * Adjust quota data.
 	 */
-	if (qfield)
+	if (qfield && !(bflags & XFS_BMAPI_REMAP))
 		xfs_trans_mod_dquot_byino(tp, ip, qfield, (long)-nblks);
 
 	/*
@@ -5014,6 +5171,175 @@ done:
 	return error;
 }
 
+/* Remove an extent from the CoW fork.  Similar to xfs_bmap_del_extent. */
+int
+xfs_bunmapi_cow(
+	struct xfs_inode		*ip,
+	struct xfs_bmbt_irec		*del)
+{
+	xfs_filblks_t			da_new;
+	xfs_filblks_t			da_old;
+	xfs_fsblock_t			del_endblock = 0;
+	xfs_fileoff_t			del_endoff;
+	int				delay;
+	struct xfs_bmbt_rec_host	*ep;
+	int				error;
+	struct xfs_bmbt_irec		got;
+	xfs_fileoff_t			got_endoff;
+	struct xfs_ifork		*ifp;
+	struct xfs_mount		*mp;
+	xfs_filblks_t			nblks;
+	struct xfs_bmbt_irec		new;
+	/* REFERENCED */
+	uint				qfield;
+	xfs_filblks_t			temp;
+	xfs_filblks_t			temp2;
+	int				state = BMAP_COWFORK;
+	int				eof;
+	xfs_extnum_t			eidx;
+
+	mp = ip->i_mount;
+	XFS_STATS_INC(mp, xs_del_exlist);
+
+	ep = xfs_bmap_search_extents(ip, del->br_startoff, XFS_COW_FORK, &eof,
+			&eidx, &got, &new);
+
+	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK); ifp = ifp;
+	ASSERT((eidx >= 0) && (eidx < ifp->if_bytes /
+		(uint)sizeof(xfs_bmbt_rec_t)));
+	ASSERT(del->br_blockcount > 0);
+	ASSERT(got.br_startoff <= del->br_startoff);
+	del_endoff = del->br_startoff + del->br_blockcount;
+	got_endoff = got.br_startoff + got.br_blockcount;
+	ASSERT(got_endoff >= del_endoff);
+	delay = isnullstartblock(got.br_startblock);
+	ASSERT(isnullstartblock(del->br_startblock) == delay);
+	qfield = 0;
+	error = 0;
+	/*
+	 * If deleting a real allocation, must free up the disk space.
+	 */
+	if (!delay) {
+		nblks = del->br_blockcount;
+		qfield = XFS_TRANS_DQ_BCOUNT;
+		/*
+		 * Set up del_endblock and cur for later.
+		 */
+		del_endblock = del->br_startblock + del->br_blockcount;
+		da_old = da_new = 0;
+	} else {
+		da_old = startblockval(got.br_startblock);
+		da_new = 0;
+		nblks = 0;
+	}
+	qfield = qfield;
+	nblks = nblks;
+
+	/*
+	 * Set flag value to use in switch statement.
+	 * Left-contig is 2, right-contig is 1.
+	 */
+	switch (((got.br_startoff == del->br_startoff) << 1) |
+		(got_endoff == del_endoff)) {
+	case 3:
+		/*
+		 * Matches the whole extent.  Delete the entry.
+		 */
+		xfs_iext_remove(ip, eidx, 1, BMAP_COWFORK);
+		--eidx;
+		break;
+
+	case 2:
+		/*
+		 * Deleting the first part of the extent.
+		 */
+		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+		xfs_bmbt_set_startoff(ep, del_endoff);
+		temp = got.br_blockcount - del->br_blockcount;
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		xfs_bmbt_set_startblock(ep, del_endblock);
+		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+		break;
+
+	case 1:
+		/*
+		 * Deleting the last part of the extent.
+		 */
+		temp = got.br_blockcount - del->br_blockcount;
+		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		if (delay) {
+			temp = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp),
+				da_old);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+			da_new = temp;
+			break;
+		}
+		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+		break;
+
+	case 0:
+		/*
+		 * Deleting the middle of the extent.
+		 */
+		temp = del->br_startoff - got.br_startoff;
+		trace_xfs_bmap_pre_update(ip, eidx, state, _THIS_IP_);
+		xfs_bmbt_set_blockcount(ep, temp);
+		new.br_startoff = del_endoff;
+		temp2 = got_endoff - del_endoff;
+		new.br_blockcount = temp2;
+		new.br_state = got.br_state;
+		if (!delay) {
+			new.br_startblock = del_endblock;
+		} else {
+			temp = xfs_bmap_worst_indlen(ip, temp);
+			xfs_bmbt_set_startblock(ep, nullstartblock((int)temp));
+			temp2 = xfs_bmap_worst_indlen(ip, temp2);
+			new.br_startblock = nullstartblock((int)temp2);
+			da_new = temp + temp2;
+			while (da_new > da_old) {
+				if (temp) {
+					temp--;
+					da_new--;
+					xfs_bmbt_set_startblock(ep,
+						nullstartblock((int)temp));
+				}
+				if (da_new == da_old)
+					break;
+				if (temp2) {
+					temp2--;
+					da_new--;
+					new.br_startblock =
+						nullstartblock((int)temp2);
+				}
+			}
+		}
+		trace_xfs_bmap_post_update(ip, eidx, state, _THIS_IP_);
+		xfs_iext_insert(ip, eidx + 1, 1, &new, state);
+		++eidx;
+		break;
+	}
+
+	/*
+	 * Account for change in delayed indirect blocks.
+	 * Nothing to do for disk quota accounting here.
+	 */
+	ASSERT(da_old >= da_new);
+	if (da_old > da_new)
+		xfs_mod_fdblocks(mp, (int64_t)(da_old - da_new), false);
+
+	return error;
+}
+
 /*
  * Unmap (remove) blocks from a file.
  * If nexts is nonzero then the number of extents to remove is limited to
@@ -5021,17 +5347,16 @@ done:
  * *done is set.
  */
 int						/* error */
-xfs_bunmapi(
+__xfs_bunmapi(
 	xfs_trans_t		*tp,		/* transaction pointer */
 	struct xfs_inode	*ip,		/* incore inode */
 	xfs_fileoff_t		bno,		/* starting offset to unmap */
-	xfs_filblks_t		len,		/* length to unmap in file */
+	xfs_filblks_t		*rlen,		/* i/o: amount remaining */
 	int			flags,		/* misc flags */
 	xfs_extnum_t		nexts,		/* number of extents max */
 	xfs_fsblock_t		*firstblock,	/* first allocated block
 						   controls a.g. for allocs */
-	struct xfs_defer_ops	*dfops,		/* i/o: list extents to free */
-	int			*done)		/* set if not done yet */
+	struct xfs_defer_ops	*dfops)		/* i/o: deferred updates */
 {
 	xfs_btree_cur_t		*cur;		/* bmap btree cursor */
 	xfs_bmbt_irec_t		del;		/* extent being deleted */
@@ -5053,11 +5378,12 @@ xfs_bunmapi(
 	int			wasdel;		/* was a delayed alloc extent */
 	int			whichfork;	/* data or attribute fork */
 	xfs_fsblock_t		sum;
+	xfs_filblks_t		len = *rlen;	/* length to unmap in file */
 
 	trace_xfs_bunmap(ip, bno, len, flags, _RET_IP_);
 
-	whichfork = (flags & XFS_BMAPI_ATTRFORK) ?
-		XFS_ATTR_FORK : XFS_DATA_FORK;
+	whichfork = xfs_bmapi_whichfork(flags);
+	ASSERT(whichfork != XFS_COW_FORK);
 	ifp = XFS_IFORK_PTR(ip, whichfork);
 	if (unlikely(
 	    XFS_IFORK_FORMAT(ip, whichfork) != XFS_DINODE_FMT_EXTENTS &&
@@ -5079,7 +5405,7 @@ xfs_bunmapi(
 		return error;
 	nextents = ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t);
 	if (nextents == 0) {
-		*done = 1;
+		*rlen = 0;
 		return 0;
 	}
 	XFS_STATS_INC(mp, xs_blk_unmap);
@@ -5324,7 +5650,7 @@ xfs_bunmapi(
 			cur->bc_private.b.flags &= ~XFS_BTCUR_BPRV_WASDEL;
 
 		error = xfs_bmap_del_extent(ip, tp, &lastx, dfops, cur, &del,
-				&tmp_logflags, whichfork);
+				&tmp_logflags, whichfork, flags);
 		logflags |= tmp_logflags;
 		if (error)
 			goto error0;
@@ -5350,7 +5676,10 @@ nodelete:
 			extno++;
 		}
 	}
-	*done = bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0;
+	if (bno == (xfs_fileoff_t)-1 || bno < start || lastx < 0)
+		*rlen = 0;
+	else
+		*rlen = bno - start + 1;
 
 	/*
 	 * Convert to a btree if necessary.
@@ -5406,6 +5735,27 @@ error0:
 	return error;
 }
 
+/* Unmap a range of a file. */
+int
+xfs_bunmapi(
+	xfs_trans_t		*tp,
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		bno,
+	xfs_filblks_t		len,
+	int			flags,
+	xfs_extnum_t		nexts,
+	xfs_fsblock_t		*firstblock,
+	struct xfs_defer_ops	*dfops,
+	int			*done)
+{
+	int			error;
+
+	error = __xfs_bunmapi(tp, ip, bno, &len, flags, nexts, firstblock,
+			dfops);
+	*done = (len == 0);
+	return error;
+}
+
 /*
  * Determine whether an extent shift can be accomplished by a merge with the
  * extent that precedes the target hole of the shift.
@@ -5985,3 +6335,146 @@ out:
 	xfs_trans_cancel(tp);
 	return error;
 }
+
+/* Deferred mapping is only for real extents in the data fork. */
+static bool
+xfs_bmap_is_update_needed(
+	struct xfs_bmbt_irec	*bmap)
+{
+	return  bmap->br_startblock != HOLESTARTBLOCK &&
+		bmap->br_startblock != DELAYSTARTBLOCK;
+}
+
+/* Record a bmap intent. */
+static int
+__xfs_bmap_add(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	enum xfs_bmap_intent_type	type,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	struct xfs_bmbt_irec		*bmap)
+{
+	int				error;
+	struct xfs_bmap_intent		*bi;
+
+	trace_xfs_bmap_defer(mp,
+			XFS_FSB_TO_AGNO(mp, bmap->br_startblock),
+			type,
+			XFS_FSB_TO_AGBNO(mp, bmap->br_startblock),
+			ip->i_ino, whichfork,
+			bmap->br_startoff,
+			bmap->br_blockcount,
+			bmap->br_state);
+
+	bi = kmem_alloc(sizeof(struct xfs_bmap_intent), KM_SLEEP | KM_NOFS);
+	INIT_LIST_HEAD(&bi->bi_list);
+	bi->bi_type = type;
+	bi->bi_owner = ip;
+	bi->bi_whichfork = whichfork;
+	bi->bi_bmap = *bmap;
+
+	error = xfs_defer_join(dfops, bi->bi_owner);
+	if (error) {
+		kmem_free(bi);
+		return error;
+	}
+
+	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_BMAP, &bi->bi_list);
+	return 0;
+}
+
+/* Map an extent into a file. */
+int
+xfs_bmap_map_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*PREV)
+{
+	if (!xfs_bmap_is_update_needed(PREV))
+		return 0;
+
+	return __xfs_bmap_add(mp, dfops, XFS_BMAP_MAP, ip,
+			XFS_DATA_FORK, PREV);
+}
+
+/* Unmap an extent out of a file. */
+int
+xfs_bmap_unmap_extent(
+	struct xfs_mount	*mp,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*PREV)
+{
+	if (!xfs_bmap_is_update_needed(PREV))
+		return 0;
+
+	return __xfs_bmap_add(mp, dfops, XFS_BMAP_UNMAP, ip,
+			XFS_DATA_FORK, PREV);
+}
+
+/*
+ * Process one of the deferred bmap operations.  We pass back the
+ * btree cursor to maintain our lock on the bmapbt between calls.
+ */
+int
+xfs_bmap_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dfops,
+	struct xfs_inode		*ip,
+	enum xfs_bmap_intent_type	type,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state)
+{
+	struct xfs_bmbt_irec		bmap;
+	int				nimaps = 1;
+	xfs_fsblock_t			firstfsb;
+	int				flags = XFS_BMAPI_REMAP;
+	int				done;
+	int				error = 0;
+
+	bmap.br_startblock = startblock;
+	bmap.br_startoff = startoff;
+	bmap.br_blockcount = blockcount;
+	bmap.br_state = state;
+
+	trace_xfs_bmap_deferred(tp->t_mountp,
+			XFS_FSB_TO_AGNO(tp->t_mountp, startblock), type,
+			XFS_FSB_TO_AGBNO(tp->t_mountp, startblock),
+			ip->i_ino, whichfork, startoff, blockcount, state);
+
+	if (whichfork != XFS_DATA_FORK && whichfork != XFS_ATTR_FORK)
+		return -EFSCORRUPTED;
+	if (whichfork == XFS_ATTR_FORK)
+		flags |= XFS_BMAPI_ATTRFORK;
+
+	if (XFS_TEST_ERROR(false, tp->t_mountp,
+			XFS_ERRTAG_BMAP_FINISH_ONE,
+			XFS_RANDOM_BMAP_FINISH_ONE))
+		return -EIO;
+
+	switch (type) {
+	case XFS_BMAP_MAP:
+		firstfsb = bmap.br_startblock;
+		error = xfs_bmapi_write(tp, ip, bmap.br_startoff,
+					bmap.br_blockcount, flags, &firstfsb,
+					bmap.br_blockcount, &bmap, &nimaps,
+					dfops);
+		break;
+	case XFS_BMAP_UNMAP:
+		error = xfs_bunmapi(tp, ip, bmap.br_startoff,
+				bmap.br_blockcount, flags, 1, &firstfsb,
+				dfops, &done);
+		ASSERT(done);
+		break;
+	default:
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+	}
+
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h
index 8395f6e8cf7d..f97db7132564 100644
--- a/fs/xfs/libxfs/xfs_bmap.h
+++ b/fs/xfs/libxfs/xfs_bmap.h
@@ -97,6 +97,19 @@ struct xfs_extent_free_item
  */
 #define XFS_BMAPI_ZERO		0x080
 
+/*
+ * Map the inode offset to the block given in ap->firstblock.  Primarily
+ * used for reflink.  The range must be in a hole, and this flag cannot be
+ * turned on with PREALLOC or CONVERT, and cannot be used on the attr fork.
+ *
+ * For bunmapi, this flag unmaps the range without adjusting quota, reducing
+ * refcount, or freeing the blocks.
+ */
+#define XFS_BMAPI_REMAP		0x100
+
+/* Map something in the CoW fork. */
+#define XFS_BMAPI_COWFORK	0x200
+
 #define XFS_BMAPI_FLAGS \
 	{ XFS_BMAPI_ENTIRE,	"ENTIRE" }, \
 	{ XFS_BMAPI_METADATA,	"METADATA" }, \
@@ -105,12 +118,24 @@ struct xfs_extent_free_item
 	{ XFS_BMAPI_IGSTATE,	"IGSTATE" }, \
 	{ XFS_BMAPI_CONTIG,	"CONTIG" }, \
 	{ XFS_BMAPI_CONVERT,	"CONVERT" }, \
-	{ XFS_BMAPI_ZERO,	"ZERO" }
+	{ XFS_BMAPI_ZERO,	"ZERO" }, \
+	{ XFS_BMAPI_REMAP,	"REMAP" }, \
+	{ XFS_BMAPI_COWFORK,	"COWFORK" }
 
 
 static inline int xfs_bmapi_aflag(int w)
 {
-	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK : 0);
+	return (w == XFS_ATTR_FORK ? XFS_BMAPI_ATTRFORK :
+	       (w == XFS_COW_FORK ? XFS_BMAPI_COWFORK : 0));
+}
+
+static inline int xfs_bmapi_whichfork(int bmapi_flags)
+{
+	if (bmapi_flags & XFS_BMAPI_COWFORK)
+		return XFS_COW_FORK;
+	else if (bmapi_flags & XFS_BMAPI_ATTRFORK)
+		return XFS_ATTR_FORK;
+	return XFS_DATA_FORK;
 }
 
 /*
@@ -131,13 +156,15 @@ static inline int xfs_bmapi_aflag(int w)
 #define BMAP_LEFT_VALID		(1 << 6)
 #define BMAP_RIGHT_VALID	(1 << 7)
 #define BMAP_ATTRFORK		(1 << 8)
+#define BMAP_COWFORK		(1 << 9)
 
 #define XFS_BMAP_EXT_FLAGS \
 	{ BMAP_LEFT_CONTIG,	"LC" }, \
 	{ BMAP_RIGHT_CONTIG,	"RC" }, \
 	{ BMAP_LEFT_FILLING,	"LF" }, \
 	{ BMAP_RIGHT_FILLING,	"RF" }, \
-	{ BMAP_ATTRFORK,	"ATTR" }
+	{ BMAP_ATTRFORK,	"ATTR" }, \
+	{ BMAP_COWFORK,		"COW" }
 
 
 /*
@@ -186,10 +213,15 @@ int	xfs_bmapi_write(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fsblock_t *firstblock, xfs_extlen_t total,
 		struct xfs_bmbt_irec *mval, int *nmap,
 		struct xfs_defer_ops *dfops);
+int	__xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
+		xfs_fileoff_t bno, xfs_filblks_t *rlen, int flags,
+		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
+		struct xfs_defer_ops *dfops);
 int	xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip,
 		xfs_fileoff_t bno, xfs_filblks_t len, int flags,
 		xfs_extnum_t nexts, xfs_fsblock_t *firstblock,
 		struct xfs_defer_ops *dfops, int *done);
+int	xfs_bunmapi_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *del);
 int	xfs_check_nostate_extents(struct xfs_ifork *ifp, xfs_extnum_t idx,
 		xfs_extnum_t num);
 uint	xfs_default_attroffset(struct xfs_inode *ip);
@@ -203,8 +235,31 @@ struct xfs_bmbt_rec_host *
 	xfs_bmap_search_extents(struct xfs_inode *ip, xfs_fileoff_t bno,
 		int fork, int *eofp, xfs_extnum_t *lastxp,
 		struct xfs_bmbt_irec *gotp, struct xfs_bmbt_irec *prevp);
-int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, xfs_fileoff_t aoff,
-		xfs_filblks_t len, struct xfs_bmbt_irec *got,
-		struct xfs_bmbt_irec *prev, xfs_extnum_t *lastx, int eof);
+int	xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork,
+		xfs_fileoff_t aoff, xfs_filblks_t len,
+		struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *prev,
+		xfs_extnum_t *lastx, int eof);
+
+enum xfs_bmap_intent_type {
+	XFS_BMAP_MAP = 1,
+	XFS_BMAP_UNMAP,
+};
+
+struct xfs_bmap_intent {
+	struct list_head			bi_list;
+	enum xfs_bmap_intent_type		bi_type;
+	struct xfs_inode			*bi_owner;
+	int					bi_whichfork;
+	struct xfs_bmbt_irec			bi_bmap;
+};
+
+int	xfs_bmap_finish_one(struct xfs_trans *tp, struct xfs_defer_ops *dfops,
+		struct xfs_inode *ip, enum xfs_bmap_intent_type type,
+		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+		xfs_filblks_t blockcount, xfs_exntst_t state);
+int	xfs_bmap_map_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
+int	xfs_bmap_unmap_extent(struct xfs_mount *mp, struct xfs_defer_ops *dfops,
+		struct xfs_inode *ip, struct xfs_bmbt_irec *imap);
 
 #endif	/* __XFS_BMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_bmap_btree.c b/fs/xfs/libxfs/xfs_bmap_btree.c
index cd85274e810c..8007d2ba9aef 100644
--- a/fs/xfs/libxfs/xfs_bmap_btree.c
+++ b/fs/xfs/libxfs/xfs_bmap_btree.c
@@ -453,6 +453,7 @@ xfs_bmbt_alloc_block(
 
 	if (args.fsbno == NULLFSBLOCK) {
 		args.fsbno = be64_to_cpu(start->l);
+try_another_ag:
 		args.type = XFS_ALLOCTYPE_START_BNO;
 		/*
 		 * Make sure there is sufficient room left in the AG to
@@ -482,6 +483,22 @@ xfs_bmbt_alloc_block(
 	if (error)
 		goto error0;
 
+	/*
+	 * During a CoW operation, the allocation and bmbt updates occur in
+	 * different transactions.  The mapping code tries to put new bmbt
+	 * blocks near extents being mapped, but the only way to guarantee this
+	 * is if the alloc and the mapping happen in a single transaction that
+	 * has a block reservation.  That isn't the case here, so if we run out
+	 * of space we'll try again with another AG.
+	 */
+	if (xfs_sb_version_hasreflink(&cur->bc_mp->m_sb) &&
+	    args.fsbno == NULLFSBLOCK &&
+	    args.type == XFS_ALLOCTYPE_NEAR_BNO) {
+		cur->bc_private.b.dfops->dop_low = true;
+		args.fsbno = cur->bc_private.b.firstblock;
+		goto try_another_ag;
+	}
+
 	if (args.fsbno == NULLFSBLOCK && args.minleft) {
 		/*
 		 * Could not find an AG with enough free space to satisfy
@@ -777,6 +794,7 @@ xfs_bmbt_init_cursor(
 {
 	struct xfs_ifork	*ifp = XFS_IFORK_PTR(ip, whichfork);
 	struct xfs_btree_cur	*cur;
+	ASSERT(whichfork != XFS_COW_FORK);
 
 	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_SLEEP);
 
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index aa1752f918b8..5c8e6f2ce44f 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -45,9 +45,10 @@ kmem_zone_t	*xfs_btree_cur_zone;
  */
 static const __uint32_t xfs_magics[2][XFS_BTNUM_MAX] = {
 	{ XFS_ABTB_MAGIC, XFS_ABTC_MAGIC, 0, XFS_BMAP_MAGIC, XFS_IBT_MAGIC,
-	  XFS_FIBT_MAGIC },
+	  XFS_FIBT_MAGIC, 0 },
 	{ XFS_ABTB_CRC_MAGIC, XFS_ABTC_CRC_MAGIC, XFS_RMAP_CRC_MAGIC,
-	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC }
+	  XFS_BMAP_CRC_MAGIC, XFS_IBT_CRC_MAGIC, XFS_FIBT_CRC_MAGIC,
+	  XFS_REFC_CRC_MAGIC }
 };
 #define xfs_btree_magic(cur) \
 	xfs_magics[!!((cur)->bc_flags & XFS_BTREE_CRC_BLOCKS)][cur->bc_btnum]
@@ -1216,6 +1217,9 @@ xfs_btree_set_refs(
 	case XFS_BTNUM_RMAP:
 		xfs_buf_set_ref(bp, XFS_RMAP_BTREE_REF);
 		break;
+	case XFS_BTNUM_REFC:
+		xfs_buf_set_ref(bp, XFS_REFC_BTREE_REF);
+		break;
 	default:
 		ASSERT(0);
 	}
diff --git a/fs/xfs/libxfs/xfs_btree.h b/fs/xfs/libxfs/xfs_btree.h
index 3f8556a5c2ad..c2b01d1c79ee 100644
--- a/fs/xfs/libxfs/xfs_btree.h
+++ b/fs/xfs/libxfs/xfs_btree.h
@@ -49,6 +49,7 @@ union xfs_btree_key {
 	struct xfs_inobt_key		inobt;
 	struct xfs_rmap_key		rmap;
 	struct xfs_rmap_key		__rmap_bigkey[2];
+	struct xfs_refcount_key		refc;
 };
 
 union xfs_btree_rec {
@@ -57,6 +58,7 @@ union xfs_btree_rec {
 	struct xfs_alloc_rec		alloc;
 	struct xfs_inobt_rec		inobt;
 	struct xfs_rmap_rec		rmap;
+	struct xfs_refcount_rec		refc;
 };
 
 /*
@@ -72,6 +74,7 @@ union xfs_btree_rec {
 #define	XFS_BTNUM_INO	((xfs_btnum_t)XFS_BTNUM_INOi)
 #define	XFS_BTNUM_FINO	((xfs_btnum_t)XFS_BTNUM_FINOi)
 #define	XFS_BTNUM_RMAP	((xfs_btnum_t)XFS_BTNUM_RMAPi)
+#define	XFS_BTNUM_REFC	((xfs_btnum_t)XFS_BTNUM_REFCi)
 
 /*
  * For logging record fields.
@@ -105,6 +108,7 @@ do {    \
 	case XFS_BTNUM_INO: __XFS_BTREE_STATS_INC(__mp, ibt, stat); break; \
 	case XFS_BTNUM_FINO: __XFS_BTREE_STATS_INC(__mp, fibt, stat); break; \
 	case XFS_BTNUM_RMAP: __XFS_BTREE_STATS_INC(__mp, rmap, stat); break; \
+	case XFS_BTNUM_REFC: __XFS_BTREE_STATS_INC(__mp, refcbt, stat); break; \
 	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break;	\
 	}       \
 } while (0)
@@ -127,6 +131,8 @@ do {    \
 		__XFS_BTREE_STATS_ADD(__mp, fibt, stat, val); break; \
 	case XFS_BTNUM_RMAP:	\
 		__XFS_BTREE_STATS_ADD(__mp, rmap, stat, val); break; \
+	case XFS_BTNUM_REFC:	\
+		__XFS_BTREE_STATS_ADD(__mp, refcbt, stat, val); break; \
 	case XFS_BTNUM_MAX: ASSERT(0); /* fucking gcc */ ; break; \
 	}       \
 } while (0)
@@ -217,6 +223,15 @@ union xfs_btree_irec {
 	struct xfs_bmbt_irec		b;
 	struct xfs_inobt_rec_incore	i;
 	struct xfs_rmap_irec		r;
+	struct xfs_refcount_irec	rc;
+};
+
+/* Per-AG btree private information. */
+union xfs_btree_cur_private {
+	struct {
+		unsigned long	nr_ops;		/* # record updates */
+		int		shape_changes;	/* # of extent splits */
+	} refc;
 };
 
 /*
@@ -243,6 +258,7 @@ typedef struct xfs_btree_cur
 			struct xfs_buf	*agbp;	/* agf/agi buffer pointer */
 			struct xfs_defer_ops *dfops;	/* deferred updates */
 			xfs_agnumber_t	agno;	/* ag number */
+			union xfs_btree_cur_private	priv;
 		} a;
 		struct {			/* needed for BMAP */
 			struct xfs_inode *ip;	/* pointer to our inode */
diff --git a/fs/xfs/libxfs/xfs_defer.h b/fs/xfs/libxfs/xfs_defer.h
index e96533d178cf..f6e93ef0bffe 100644
--- a/fs/xfs/libxfs/xfs_defer.h
+++ b/fs/xfs/libxfs/xfs_defer.h
@@ -51,6 +51,8 @@ struct xfs_defer_pending {
  * find all the space it needs.
  */
 enum xfs_defer_ops_type {
+	XFS_DEFER_OPS_TYPE_BMAP,
+	XFS_DEFER_OPS_TYPE_REFCOUNT,
 	XFS_DEFER_OPS_TYPE_RMAP,
 	XFS_DEFER_OPS_TYPE_FREE,
 	XFS_DEFER_OPS_TYPE_MAX,
diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h
index 270fb5cf4fa1..f6547fc5e016 100644
--- a/fs/xfs/libxfs/xfs_format.h
+++ b/fs/xfs/libxfs/xfs_format.h
@@ -456,9 +456,11 @@ xfs_sb_has_compat_feature(
 
 #define XFS_SB_FEAT_RO_COMPAT_FINOBT   (1 << 0)		/* free inode btree */
 #define XFS_SB_FEAT_RO_COMPAT_RMAPBT   (1 << 1)		/* reverse map btree */
+#define XFS_SB_FEAT_RO_COMPAT_REFLINK  (1 << 2)		/* reflinked files */
 #define XFS_SB_FEAT_RO_COMPAT_ALL \
 		(XFS_SB_FEAT_RO_COMPAT_FINOBT | \
-		 XFS_SB_FEAT_RO_COMPAT_RMAPBT)
+		 XFS_SB_FEAT_RO_COMPAT_RMAPBT | \
+		 XFS_SB_FEAT_RO_COMPAT_REFLINK)
 #define XFS_SB_FEAT_RO_COMPAT_UNKNOWN	~XFS_SB_FEAT_RO_COMPAT_ALL
 static inline bool
 xfs_sb_has_ro_compat_feature(
@@ -546,6 +548,12 @@ static inline bool xfs_sb_version_hasrmapbt(struct xfs_sb *sbp)
 		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_RMAPBT);
 }
 
+static inline bool xfs_sb_version_hasreflink(struct xfs_sb *sbp)
+{
+	return XFS_SB_VERSION_NUM(sbp) == XFS_SB_VERSION_5 &&
+		(sbp->sb_features_ro_compat & XFS_SB_FEAT_RO_COMPAT_REFLINK);
+}
+
 /*
  * end of superblock version macros
  */
@@ -641,14 +649,17 @@ typedef struct xfs_agf {
 	uuid_t		agf_uuid;	/* uuid of filesystem */
 
 	__be32		agf_rmap_blocks;	/* rmapbt blocks used */
-	__be32		agf_padding;		/* padding */
+	__be32		agf_refcount_blocks;	/* refcountbt blocks used */
+
+	__be32		agf_refcount_root;	/* refcount tree root block */
+	__be32		agf_refcount_level;	/* refcount btree levels */
 
 	/*
 	 * reserve some contiguous space for future logged fields before we add
 	 * the unlogged fields. This makes the range logging via flags and
 	 * structure offsets much simpler.
 	 */
-	__be64		agf_spare64[15];
+	__be64		agf_spare64[14];
 
 	/* unlogged fields, written during buffer writeback. */
 	__be64		agf_lsn;	/* last write sequence */
@@ -674,8 +685,11 @@ typedef struct xfs_agf {
 #define	XFS_AGF_BTREEBLKS	0x00000800
 #define	XFS_AGF_UUID		0x00001000
 #define	XFS_AGF_RMAP_BLOCKS	0x00002000
-#define	XFS_AGF_SPARE64		0x00004000
-#define	XFS_AGF_NUM_BITS	15
+#define	XFS_AGF_REFCOUNT_BLOCKS	0x00004000
+#define	XFS_AGF_REFCOUNT_ROOT	0x00008000
+#define	XFS_AGF_REFCOUNT_LEVEL	0x00010000
+#define	XFS_AGF_SPARE64		0x00020000
+#define	XFS_AGF_NUM_BITS	18
 #define	XFS_AGF_ALL_BITS	((1 << XFS_AGF_NUM_BITS) - 1)
 
 #define XFS_AGF_FLAGS \
@@ -693,6 +707,9 @@ typedef struct xfs_agf {
 	{ XFS_AGF_BTREEBLKS,	"BTREEBLKS" }, \
 	{ XFS_AGF_UUID,		"UUID" }, \
 	{ XFS_AGF_RMAP_BLOCKS,	"RMAP_BLOCKS" }, \
+	{ XFS_AGF_REFCOUNT_BLOCKS,	"REFCOUNT_BLOCKS" }, \
+	{ XFS_AGF_REFCOUNT_ROOT,	"REFCOUNT_ROOT" }, \
+	{ XFS_AGF_REFCOUNT_LEVEL,	"REFCOUNT_LEVEL" }, \
 	{ XFS_AGF_SPARE64,	"SPARE64" }
 
 /* disk block (xfs_daddr_t) in the AG */
@@ -885,7 +902,8 @@ typedef struct xfs_dinode {
 	__be64		di_changecount;	/* number of attribute changes */
 	__be64		di_lsn;		/* flush sequence */
 	__be64		di_flags2;	/* more random flags */
-	__u8		di_pad2[16];	/* more padding for future expansion */
+	__be32		di_cowextsize;	/* basic cow extent size for file */
+	__u8		di_pad2[12];	/* more padding for future expansion */
 
 	/* fields only written to during inode creation */
 	xfs_timestamp_t	di_crtime;	/* time created */
@@ -1041,9 +1059,14 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev)
  * 16 bits of the XFS_XFLAG_s range.
  */
 #define XFS_DIFLAG2_DAX_BIT	0	/* use DAX for this inode */
+#define XFS_DIFLAG2_REFLINK_BIT	1	/* file's blocks may be shared */
+#define XFS_DIFLAG2_COWEXTSIZE_BIT   2  /* copy on write extent size hint */
 #define XFS_DIFLAG2_DAX		(1 << XFS_DIFLAG2_DAX_BIT)
+#define XFS_DIFLAG2_REFLINK     (1 << XFS_DIFLAG2_REFLINK_BIT)
+#define XFS_DIFLAG2_COWEXTSIZE  (1 << XFS_DIFLAG2_COWEXTSIZE_BIT)
 
-#define XFS_DIFLAG2_ANY		(XFS_DIFLAG2_DAX)
+#define XFS_DIFLAG2_ANY \
+	(XFS_DIFLAG2_DAX | XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)
 
 /*
  * Inode number format:
@@ -1353,7 +1376,9 @@ struct xfs_owner_info {
 #define XFS_RMAP_OWN_AG		(-5ULL)	/* AG freespace btree blocks */
 #define XFS_RMAP_OWN_INOBT	(-6ULL)	/* Inode btree blocks */
 #define XFS_RMAP_OWN_INODES	(-7ULL)	/* Inode chunk */
-#define XFS_RMAP_OWN_MIN	(-8ULL) /* guard */
+#define XFS_RMAP_OWN_REFC	(-8ULL) /* refcount tree */
+#define XFS_RMAP_OWN_COW	(-9ULL) /* cow allocations */
+#define XFS_RMAP_OWN_MIN	(-10ULL) /* guard */
 
 #define XFS_RMAP_NON_INODE_OWNER(owner)	(!!((owner) & (1ULL << 63)))
 
@@ -1434,6 +1459,62 @@ typedef __be32 xfs_rmap_ptr_t;
 	 XFS_IBT_BLOCK(mp) + 1)
 
 /*
+ * Reference Count Btree format definitions
+ *
+ */
+#define	XFS_REFC_CRC_MAGIC	0x52334643	/* 'R3FC' */
+
+unsigned int xfs_refc_block(struct xfs_mount *mp);
+
+/*
+ * Data record/key structure
+ *
+ * Each record associates a range of physical blocks (starting at
+ * rc_startblock and ending rc_blockcount blocks later) with a reference
+ * count (rc_refcount).  Extents that are being used to stage a copy on
+ * write (CoW) operation are recorded in the refcount btree with a
+ * refcount of 1.  All other records must have a refcount > 1 and must
+ * track an extent mapped only by file data forks.
+ *
+ * Extents with a single owner (attributes, metadata, non-shared file
+ * data) are not tracked here.  Free space is also not tracked here.
+ * This is consistent with pre-reflink XFS.
+ */
+
+/*
+ * Extents that are being used to stage a copy on write are stored
+ * in the refcount btree with a refcount of 1 and the upper bit set
+ * on the startblock.  This speeds up mount time deletion of stale
+ * staging extents because they're all at the right side of the tree.
+ */
+#define XFS_REFC_COW_START		((xfs_agblock_t)(1U << 31))
+#define REFCNTBT_COWFLAG_BITLEN		1
+#define REFCNTBT_AGBLOCK_BITLEN		31
+
+struct xfs_refcount_rec {
+	__be32		rc_startblock;	/* starting block number */
+	__be32		rc_blockcount;	/* count of blocks */
+	__be32		rc_refcount;	/* number of inodes linked here */
+};
+
+struct xfs_refcount_key {
+	__be32		rc_startblock;	/* starting block number */
+};
+
+struct xfs_refcount_irec {
+	xfs_agblock_t	rc_startblock;	/* starting block number */
+	xfs_extlen_t	rc_blockcount;	/* count of free blocks */
+	xfs_nlink_t	rc_refcount;	/* number of inodes linked here */
+};
+
+#define MAXREFCOUNT	((xfs_nlink_t)~0U)
+#define MAXREFCEXTLEN	((xfs_extlen_t)~0U)
+
+/* btree pointer type */
+typedef __be32 xfs_refcount_ptr_t;
+
+
+/*
  * BMAP Btree format definitions
  *
  * This includes both the root block definition that sits inside an inode fork
diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h
index 79455058b752..b72dc821d78b 100644
--- a/fs/xfs/libxfs/xfs_fs.h
+++ b/fs/xfs/libxfs/xfs_fs.h
@@ -81,14 +81,16 @@ struct getbmapx {
 #define BMV_IF_PREALLOC		0x4	/* rtn status BMV_OF_PREALLOC if req */
 #define BMV_IF_DELALLOC		0x8	/* rtn status BMV_OF_DELALLOC if req */
 #define BMV_IF_NO_HOLES		0x10	/* Do not return holes */
+#define BMV_IF_COWFORK		0x20	/* return CoW fork rather than data */
 #define BMV_IF_VALID	\
 	(BMV_IF_ATTRFORK|BMV_IF_NO_DMAPI_READ|BMV_IF_PREALLOC|	\
-	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES)
+	 BMV_IF_DELALLOC|BMV_IF_NO_HOLES|BMV_IF_COWFORK)
 
 /*	bmv_oflags values - returned for each non-header segment */
 #define BMV_OF_PREALLOC		0x1	/* segment = unwritten pre-allocation */
 #define BMV_OF_DELALLOC		0x2	/* segment = delayed allocation */
 #define BMV_OF_LAST		0x4	/* segment is the last in the file */
+#define BMV_OF_SHARED		0x8	/* segment shared with another file */
 
 /*
  * Structure for XFS_IOC_FSSETDM.
@@ -206,7 +208,8 @@ typedef struct xfs_fsop_resblks {
 #define XFS_FSOP_GEOM_FLAGS_FTYPE	0x10000	/* inode directory types */
 #define XFS_FSOP_GEOM_FLAGS_FINOBT	0x20000	/* free inode btree */
 #define XFS_FSOP_GEOM_FLAGS_SPINODES	0x40000	/* sparse inode chunks	*/
-#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* Reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_RMAPBT	0x80000	/* reverse mapping btree */
+#define XFS_FSOP_GEOM_FLAGS_REFLINK	0x100000 /* files can share blocks */
 
 /*
  * Minimum and maximum sizes need for growth checks.
@@ -275,7 +278,8 @@ typedef struct xfs_bstat {
 #define	bs_projid	bs_projid_lo	/* (previously just bs_projid)	*/
 	__u16		bs_forkoff;	/* inode fork offset in bytes	*/
 	__u16		bs_projid_hi;	/* higher part of project id	*/
-	unsigned char	bs_pad[10];	/* pad space, unused		*/
+	unsigned char	bs_pad[6];	/* pad space, unused		*/
+	__u32		bs_cowextsize;	/* cow extent size		*/
 	__u32		bs_dmevmask;	/* DMIG event mask		*/
 	__u16		bs_dmstate;	/* DMIG state info		*/
 	__u16		bs_aextents;	/* attribute number of extents	*/
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index 4b9769e23c83..8de9a3a29589 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -256,6 +256,7 @@ xfs_inode_from_disk(
 		to->di_crtime.t_sec = be32_to_cpu(from->di_crtime.t_sec);
 		to->di_crtime.t_nsec = be32_to_cpu(from->di_crtime.t_nsec);
 		to->di_flags2 = be64_to_cpu(from->di_flags2);
+		to->di_cowextsize = be32_to_cpu(from->di_cowextsize);
 	}
 }
 
@@ -305,7 +306,7 @@ xfs_inode_to_disk(
 		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 		to->di_flags2 = cpu_to_be64(from->di_flags2);
-
+		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
 		to->di_ino = cpu_to_be64(ip->i_ino);
 		to->di_lsn = cpu_to_be64(lsn);
 		memset(to->di_pad2, 0, sizeof(to->di_pad2));
@@ -357,6 +358,7 @@ xfs_log_dinode_to_disk(
 		to->di_crtime.t_sec = cpu_to_be32(from->di_crtime.t_sec);
 		to->di_crtime.t_nsec = cpu_to_be32(from->di_crtime.t_nsec);
 		to->di_flags2 = cpu_to_be64(from->di_flags2);
+		to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
 		to->di_ino = cpu_to_be64(from->di_ino);
 		to->di_lsn = cpu_to_be64(from->di_lsn);
 		memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
@@ -373,6 +375,9 @@ xfs_dinode_verify(
 	struct xfs_inode	*ip,
 	struct xfs_dinode	*dip)
 {
+	uint16_t		flags;
+	uint64_t		flags2;
+
 	if (dip->di_magic != cpu_to_be16(XFS_DINODE_MAGIC))
 		return false;
 
@@ -389,6 +394,23 @@ xfs_dinode_verify(
 		return false;
 	if (!uuid_equal(&dip->di_uuid, &mp->m_sb.sb_meta_uuid))
 		return false;
+
+	flags = be16_to_cpu(dip->di_flags);
+	flags2 = be64_to_cpu(dip->di_flags2);
+
+	/* don't allow reflink/cowextsize if we don't have reflink */
+	if ((flags2 & (XFS_DIFLAG2_REFLINK | XFS_DIFLAG2_COWEXTSIZE)) &&
+            !xfs_sb_version_hasreflink(&mp->m_sb))
+		return false;
+
+	/* don't let reflink and realtime mix */
+	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags & XFS_DIFLAG_REALTIME))
+		return false;
+
+	/* don't let reflink and dax mix */
+	if ((flags2 & XFS_DIFLAG2_REFLINK) && (flags2 & XFS_DIFLAG2_DAX))
+		return false;
+
 	return true;
 }
 
diff --git a/fs/xfs/libxfs/xfs_inode_buf.h b/fs/xfs/libxfs/xfs_inode_buf.h
index 7c4dd321b215..62d9d4681c8c 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.h
+++ b/fs/xfs/libxfs/xfs_inode_buf.h
@@ -47,6 +47,7 @@ struct xfs_icdinode {
 	__uint16_t	di_flags;	/* random flags, XFS_DIFLAG_... */
 
 	__uint64_t	di_flags2;	/* more random flags */
+	__uint32_t	di_cowextsize;	/* basic cow extent size for file */
 
 	xfs_ictimestamp_t di_crtime;	/* time created */
 };
diff --git a/fs/xfs/libxfs/xfs_inode_fork.c b/fs/xfs/libxfs/xfs_inode_fork.c
index bbcc8c7a44b3..5dd56d3dbb3a 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.c
+++ b/fs/xfs/libxfs/xfs_inode_fork.c
@@ -121,6 +121,26 @@ xfs_iformat_fork(
 		return -EFSCORRUPTED;
 	}
 
+	if (unlikely(xfs_is_reflink_inode(ip) &&
+	    (VFS_I(ip)->i_mode & S_IFMT) != S_IFREG)) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %llu, wrong file type for reflink.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
+	if (unlikely(xfs_is_reflink_inode(ip) &&
+	    (ip->i_d.di_flags & XFS_DIFLAG_REALTIME))) {
+		xfs_warn(ip->i_mount,
+			"corrupt dinode %llu, has reflink+realtime flag set.",
+			ip->i_ino);
+		XFS_CORRUPTION_ERROR("xfs_iformat(reflink)",
+				     XFS_ERRLEVEL_LOW, ip->i_mount, dip);
+		return -EFSCORRUPTED;
+	}
+
 	switch (VFS_I(ip)->i_mode & S_IFMT) {
 	case S_IFIFO:
 	case S_IFCHR:
@@ -186,9 +206,14 @@ xfs_iformat_fork(
 		XFS_ERROR_REPORT("xfs_iformat(7)", XFS_ERRLEVEL_LOW, ip->i_mount);
 		return -EFSCORRUPTED;
 	}
-	if (error) {
+	if (error)
 		return error;
+
+	if (xfs_is_reflink_inode(ip)) {
+		ASSERT(ip->i_cowfp == NULL);
+		xfs_ifork_init_cow(ip);
 	}
+
 	if (!XFS_DFORK_Q(dip))
 		return 0;
 
@@ -208,7 +233,8 @@ xfs_iformat_fork(
 			XFS_CORRUPTION_ERROR("xfs_iformat(8)",
 					     XFS_ERRLEVEL_LOW,
 					     ip->i_mount, dip);
-			return -EFSCORRUPTED;
+			error = -EFSCORRUPTED;
+			break;
 		}
 
 		error = xfs_iformat_local(ip, dip, XFS_ATTR_FORK, size);
@@ -226,6 +252,9 @@ xfs_iformat_fork(
 	if (error) {
 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 		ip->i_afp = NULL;
+		if (ip->i_cowfp)
+			kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+		ip->i_cowfp = NULL;
 		xfs_idestroy_fork(ip, XFS_DATA_FORK);
 	}
 	return error;
@@ -740,6 +769,9 @@ xfs_idestroy_fork(
 	if (whichfork == XFS_ATTR_FORK) {
 		kmem_zone_free(xfs_ifork_zone, ip->i_afp);
 		ip->i_afp = NULL;
+	} else if (whichfork == XFS_COW_FORK) {
+		kmem_zone_free(xfs_ifork_zone, ip->i_cowfp);
+		ip->i_cowfp = NULL;
 	}
 }
 
@@ -927,6 +959,19 @@ xfs_iext_get_ext(
 	}
 }
 
+/* Convert bmap state flags to an inode fork. */
+struct xfs_ifork *
+xfs_iext_state_to_fork(
+	struct xfs_inode	*ip,
+	int			state)
+{
+	if (state & BMAP_COWFORK)
+		return ip->i_cowfp;
+	else if (state & BMAP_ATTRFORK)
+		return ip->i_afp;
+	return &ip->i_df;
+}
+
 /*
  * Insert new item(s) into the extent records for incore inode
  * fork 'ifp'.  'count' new items are inserted at index 'idx'.
@@ -939,7 +984,7 @@ xfs_iext_insert(
 	xfs_bmbt_irec_t	*new,		/* items to insert */
 	int		state)		/* type of extent conversion */
 {
-	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);
 	xfs_extnum_t	i;		/* extent record index */
 
 	trace_xfs_iext_insert(ip, idx, new, state, _RET_IP_);
@@ -1189,7 +1234,7 @@ xfs_iext_remove(
 	int		ext_diff,	/* number of extents to remove */
 	int		state)		/* type of extent conversion */
 {
-	xfs_ifork_t	*ifp = (state & BMAP_ATTRFORK) ? ip->i_afp : &ip->i_df;
+	xfs_ifork_t	*ifp = xfs_iext_state_to_fork(ip, state);
 	xfs_extnum_t	nextents;	/* number of extents in file */
 	int		new_size;	/* size of extents after removal */
 
@@ -1934,3 +1979,20 @@ xfs_iext_irec_update_extoffs(
 		ifp->if_u1.if_ext_irec[i].er_extoff += ext_diff;
 	}
 }
+
+/*
+ * Initialize an inode's copy-on-write fork.
+ */
+void
+xfs_ifork_init_cow(
+	struct xfs_inode	*ip)
+{
+	if (ip->i_cowfp)
+		return;
+
+	ip->i_cowfp = kmem_zone_zalloc(xfs_ifork_zone,
+				       KM_SLEEP | KM_NOFS);
+	ip->i_cowfp->if_flags = XFS_IFEXTENTS;
+	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
+	ip->i_cnextents = 0;
+}
diff --git a/fs/xfs/libxfs/xfs_inode_fork.h b/fs/xfs/libxfs/xfs_inode_fork.h
index f95e072ae646..c9476f50e32d 100644
--- a/fs/xfs/libxfs/xfs_inode_fork.h
+++ b/fs/xfs/libxfs/xfs_inode_fork.h
@@ -92,7 +92,9 @@ typedef struct xfs_ifork {
 #define XFS_IFORK_PTR(ip,w)		\
 	((w) == XFS_DATA_FORK ? \
 		&(ip)->i_df : \
-		(ip)->i_afp)
+		((w) == XFS_ATTR_FORK ? \
+			(ip)->i_afp : \
+			(ip)->i_cowfp))
 #define XFS_IFORK_DSIZE(ip) \
 	(XFS_IFORK_Q(ip) ? \
 		XFS_IFORK_BOFF(ip) : \
@@ -105,26 +107,38 @@ typedef struct xfs_ifork {
 #define XFS_IFORK_SIZE(ip,w) \
 	((w) == XFS_DATA_FORK ? \
 		XFS_IFORK_DSIZE(ip) : \
-		XFS_IFORK_ASIZE(ip))
+		((w) == XFS_ATTR_FORK ? \
+			XFS_IFORK_ASIZE(ip) : \
+			0))
 #define XFS_IFORK_FORMAT(ip,w) \
 	((w) == XFS_DATA_FORK ? \
 		(ip)->i_d.di_format : \
-		(ip)->i_d.di_aformat)
+		((w) == XFS_ATTR_FORK ? \
+			(ip)->i_d.di_aformat : \
+			(ip)->i_cformat))
 #define XFS_IFORK_FMT_SET(ip,w,n) \
 	((w) == XFS_DATA_FORK ? \
 		((ip)->i_d.di_format = (n)) : \
-		((ip)->i_d.di_aformat = (n)))
+		((w) == XFS_ATTR_FORK ? \
+			((ip)->i_d.di_aformat = (n)) : \
+			((ip)->i_cformat = (n))))
 #define XFS_IFORK_NEXTENTS(ip,w) \
 	((w) == XFS_DATA_FORK ? \
 		(ip)->i_d.di_nextents : \
-		(ip)->i_d.di_anextents)
+		((w) == XFS_ATTR_FORK ? \
+			(ip)->i_d.di_anextents : \
+			(ip)->i_cnextents))
 #define XFS_IFORK_NEXT_SET(ip,w,n) \
 	((w) == XFS_DATA_FORK ? \
 		((ip)->i_d.di_nextents = (n)) : \
-		((ip)->i_d.di_anextents = (n)))
+		((w) == XFS_ATTR_FORK ? \
+			((ip)->i_d.di_anextents = (n)) : \
+			((ip)->i_cnextents = (n))))
 #define XFS_IFORK_MAXEXT(ip, w) \
 	(XFS_IFORK_SIZE(ip, w) / sizeof(xfs_bmbt_rec_t))
 
+struct xfs_ifork *xfs_iext_state_to_fork(struct xfs_inode *ip, int state);
+
 int		xfs_iformat_fork(struct xfs_inode *, struct xfs_dinode *);
 void		xfs_iflush_fork(struct xfs_inode *, struct xfs_dinode *,
 				struct xfs_inode_log_item *, int);
@@ -169,4 +183,6 @@ void		xfs_iext_irec_update_extoffs(struct xfs_ifork *, int, int);
 
 extern struct kmem_zone	*xfs_ifork_zone;
 
+extern void xfs_ifork_init_cow(struct xfs_inode *ip);
+
 #endif	/* __XFS_INODE_FORK_H__ */
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index fc5eef85d61e..083cdd6d6c28 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -112,7 +112,11 @@ static inline uint xlog_get_cycle(char *ptr)
 #define XLOG_REG_TYPE_ICREATE		20
 #define XLOG_REG_TYPE_RUI_FORMAT	21
 #define XLOG_REG_TYPE_RUD_FORMAT	22
-#define XLOG_REG_TYPE_MAX		22
+#define XLOG_REG_TYPE_CUI_FORMAT	23
+#define XLOG_REG_TYPE_CUD_FORMAT	24
+#define XLOG_REG_TYPE_BUI_FORMAT	25
+#define XLOG_REG_TYPE_BUD_FORMAT	26
+#define XLOG_REG_TYPE_MAX		26
 
 /*
  * Flags to log operation header
@@ -231,6 +235,10 @@ typedef struct xfs_trans_header {
 #define	XFS_LI_ICREATE		0x123f
 #define	XFS_LI_RUI		0x1240	/* rmap update intent */
 #define	XFS_LI_RUD		0x1241
+#define	XFS_LI_CUI		0x1242	/* refcount update intent */
+#define	XFS_LI_CUD		0x1243
+#define	XFS_LI_BUI		0x1244	/* bmbt update intent */
+#define	XFS_LI_BUD		0x1245
 
 #define XFS_LI_TYPE_DESC \
 	{ XFS_LI_EFI,		"XFS_LI_EFI" }, \
@@ -242,7 +250,11 @@ typedef struct xfs_trans_header {
 	{ XFS_LI_QUOTAOFF,	"XFS_LI_QUOTAOFF" }, \
 	{ XFS_LI_ICREATE,	"XFS_LI_ICREATE" }, \
 	{ XFS_LI_RUI,		"XFS_LI_RUI" }, \
-	{ XFS_LI_RUD,		"XFS_LI_RUD" }
+	{ XFS_LI_RUD,		"XFS_LI_RUD" }, \
+	{ XFS_LI_CUI,		"XFS_LI_CUI" }, \
+	{ XFS_LI_CUD,		"XFS_LI_CUD" }, \
+	{ XFS_LI_BUI,		"XFS_LI_BUI" }, \
+	{ XFS_LI_BUD,		"XFS_LI_BUD" }
 
 /*
  * Inode Log Item Format definitions.
@@ -411,7 +423,8 @@ struct xfs_log_dinode {
 	__uint64_t	di_changecount;	/* number of attribute changes */
 	xfs_lsn_t	di_lsn;		/* flush sequence */
 	__uint64_t	di_flags2;	/* more random flags */
-	__uint8_t	di_pad2[16];	/* more padding for future expansion */
+	__uint32_t	di_cowextsize;	/* basic cow extent size for file */
+	__uint8_t	di_pad2[12];	/* more padding for future expansion */
 
 	/* fields only written to during inode creation */
 	xfs_ictimestamp_t di_crtime;	/* time created */
@@ -622,8 +635,11 @@ struct xfs_map_extent {
 
 /* rmap me_flags: upper bits are flags, lower byte is type code */
 #define XFS_RMAP_EXTENT_MAP		1
+#define XFS_RMAP_EXTENT_MAP_SHARED	2
 #define XFS_RMAP_EXTENT_UNMAP		3
+#define XFS_RMAP_EXTENT_UNMAP_SHARED	4
 #define XFS_RMAP_EXTENT_CONVERT		5
+#define XFS_RMAP_EXTENT_CONVERT_SHARED	6
 #define XFS_RMAP_EXTENT_ALLOC		7
 #define XFS_RMAP_EXTENT_FREE		8
 #define XFS_RMAP_EXTENT_TYPE_MASK	0xFF
@@ -671,6 +687,102 @@ struct xfs_rud_log_format {
 };
 
 /*
+ * CUI/CUD (refcount update) log format definitions
+ */
+struct xfs_phys_extent {
+	__uint64_t		pe_startblock;
+	__uint32_t		pe_len;
+	__uint32_t		pe_flags;
+};
+
+/* refcount pe_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_refcount_intent_type. */
+#define XFS_REFCOUNT_EXTENT_TYPE_MASK	0xFF
+
+#define XFS_REFCOUNT_EXTENT_FLAGS	(XFS_REFCOUNT_EXTENT_TYPE_MASK)
+
+/*
+ * This is the structure used to lay out a cui log item in the
+ * log.  The cui_extents field is a variable size array whose
+ * size is given by cui_nextents.
+ */
+struct xfs_cui_log_format {
+	__uint16_t		cui_type;	/* cui log item type */
+	__uint16_t		cui_size;	/* size of this item */
+	__uint32_t		cui_nextents;	/* # extents to free */
+	__uint64_t		cui_id;		/* cui identifier */
+	struct xfs_phys_extent	cui_extents[];	/* array of extents */
+};
+
+static inline size_t
+xfs_cui_log_format_sizeof(
+	unsigned int		nr)
+{
+	return sizeof(struct xfs_cui_log_format) +
+			nr * sizeof(struct xfs_phys_extent);
+}
+
+/*
+ * This is the structure used to lay out a cud log item in the
+ * log.  The cud_extents array is a variable size array whose
+ * size is given by cud_nextents;
+ */
+struct xfs_cud_log_format {
+	__uint16_t		cud_type;	/* cud log item type */
+	__uint16_t		cud_size;	/* size of this item */
+	__uint32_t		__pad;
+	__uint64_t		cud_cui_id;	/* id of corresponding cui */
+};
+
+/*
+ * BUI/BUD (inode block mapping) log format definitions
+ */
+
+/* bmbt me_flags: upper bits are flags, lower byte is type code */
+/* Type codes are taken directly from enum xfs_bmap_intent_type. */
+#define XFS_BMAP_EXTENT_TYPE_MASK	0xFF
+
+#define XFS_BMAP_EXTENT_ATTR_FORK	(1U << 31)
+#define XFS_BMAP_EXTENT_UNWRITTEN	(1U << 30)
+
+#define XFS_BMAP_EXTENT_FLAGS		(XFS_BMAP_EXTENT_TYPE_MASK | \
+					 XFS_BMAP_EXTENT_ATTR_FORK | \
+					 XFS_BMAP_EXTENT_UNWRITTEN)
+
+/*
+ * This is the structure used to lay out an bui log item in the
+ * log.  The bui_extents field is a variable size array whose
+ * size is given by bui_nextents.
+ */
+struct xfs_bui_log_format {
+	__uint16_t		bui_type;	/* bui log item type */
+	__uint16_t		bui_size;	/* size of this item */
+	__uint32_t		bui_nextents;	/* # extents to free */
+	__uint64_t		bui_id;		/* bui identifier */
+	struct xfs_map_extent	bui_extents[];	/* array of extents to bmap */
+};
+
+static inline size_t
+xfs_bui_log_format_sizeof(
+	unsigned int		nr)
+{
+	return sizeof(struct xfs_bui_log_format) +
+			nr * sizeof(struct xfs_map_extent);
+}
+
+/*
+ * This is the structure used to lay out an bud log item in the
+ * log.  The bud_extents array is a variable size array whose
+ * size is given by bud_nextents;
+ */
+struct xfs_bud_log_format {
+	__uint16_t		bud_type;	/* bud log item type */
+	__uint16_t		bud_size;	/* size of this item */
+	__uint32_t		__pad;
+	__uint64_t		bud_bui_id;	/* id of corresponding bui */
+};
+
+/*
  * Dquot Log format definitions.
  *
  * The first two fields must be the type and size fitting into
diff --git a/fs/xfs/libxfs/xfs_refcount.c b/fs/xfs/libxfs/xfs_refcount.c
new file mode 100644
index 000000000000..b177ef33cd4c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.c
@@ -0,0 +1,1698 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_refcount.h"
+#include "xfs_rmap.h"
+
+/* Allowable refcount adjustment amounts. */
+enum xfs_refc_adjust_op {
+	XFS_REFCOUNT_ADJUST_INCREASE	= 1,
+	XFS_REFCOUNT_ADJUST_DECREASE	= -1,
+	XFS_REFCOUNT_ADJUST_COW_ALLOC	= 0,
+	XFS_REFCOUNT_ADJUST_COW_FREE	= -1,
+};
+
+STATIC int __xfs_refcount_cow_alloc(struct xfs_btree_cur *rcur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen,
+		struct xfs_defer_ops *dfops);
+STATIC int __xfs_refcount_cow_free(struct xfs_btree_cur *rcur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen,
+		struct xfs_defer_ops *dfops);
+
+/*
+ * Look up the first record less than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_le(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	int			*stat)
+{
+	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+			XFS_LOOKUP_LE);
+	cur->bc_rec.rc.rc_startblock = bno;
+	cur->bc_rec.rc.rc_blockcount = 0;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
+}
+
+/*
+ * Look up the first record greater than or equal to [bno, len] in the btree
+ * given by cur.
+ */
+int
+xfs_refcount_lookup_ge(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	int			*stat)
+{
+	trace_xfs_refcount_lookup(cur->bc_mp, cur->bc_private.a.agno, bno,
+			XFS_LOOKUP_GE);
+	cur->bc_rec.rc.rc_startblock = bno;
+	cur->bc_rec.rc.rc_blockcount = 0;
+	return xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
+}
+
+/* Convert on-disk record to in-core format. */
+static inline void
+xfs_refcount_btrec_to_irec(
+	union xfs_btree_rec		*rec,
+	struct xfs_refcount_irec	*irec)
+{
+	irec->rc_startblock = be32_to_cpu(rec->refc.rc_startblock);
+	irec->rc_blockcount = be32_to_cpu(rec->refc.rc_blockcount);
+	irec->rc_refcount = be32_to_cpu(rec->refc.rc_refcount);
+}
+
+/*
+ * Get the data from the pointed-to record.
+ */
+int
+xfs_refcount_get_rec(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*irec,
+	int				*stat)
+{
+	union xfs_btree_rec		*rec;
+	int				error;
+
+	error = xfs_btree_get_rec(cur, &rec, stat);
+	if (!error && *stat == 1) {
+		xfs_refcount_btrec_to_irec(rec, irec);
+		trace_xfs_refcount_get(cur->bc_mp, cur->bc_private.a.agno,
+				irec);
+	}
+	return error;
+}
+
+/*
+ * Update the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_update(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*irec)
+{
+	union xfs_btree_rec	rec;
+	int			error;
+
+	trace_xfs_refcount_update(cur->bc_mp, cur->bc_private.a.agno, irec);
+	rec.refc.rc_startblock = cpu_to_be32(irec->rc_startblock);
+	rec.refc.rc_blockcount = cpu_to_be32(irec->rc_blockcount);
+	rec.refc.rc_refcount = cpu_to_be32(irec->rc_refcount);
+	error = xfs_btree_update(cur, &rec);
+	if (error)
+		trace_xfs_refcount_update_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Insert the record referred to by cur to the value given
+ * by [bno, len, refcount].
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_insert(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*irec,
+	int				*i)
+{
+	int				error;
+
+	trace_xfs_refcount_insert(cur->bc_mp, cur->bc_private.a.agno, irec);
+	cur->bc_rec.rc.rc_startblock = irec->rc_startblock;
+	cur->bc_rec.rc.rc_blockcount = irec->rc_blockcount;
+	cur->bc_rec.rc.rc_refcount = irec->rc_refcount;
+	error = xfs_btree_insert(cur, i);
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+out_error:
+	if (error)
+		trace_xfs_refcount_insert_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Remove the record referred to by cur, then set the pointer to the spot
+ * where the record could be re-inserted, in case we want to increment or
+ * decrement the cursor.
+ * This either works (return 0) or gets an EFSCORRUPTED error.
+ */
+STATIC int
+xfs_refcount_delete(
+	struct xfs_btree_cur	*cur,
+	int			*i)
+{
+	struct xfs_refcount_irec	irec;
+	int			found_rec;
+	int			error;
+
+	error = xfs_refcount_get_rec(cur, &irec, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	trace_xfs_refcount_delete(cur->bc_mp, cur->bc_private.a.agno, &irec);
+	error = xfs_btree_delete(cur, i);
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, *i == 1, out_error);
+	if (error)
+		goto out_error;
+	error = xfs_refcount_lookup_ge(cur, irec.rc_startblock, &found_rec);
+out_error:
+	if (error)
+		trace_xfs_refcount_delete_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Adjusting the Reference Count
+ *
+ * As stated elsewhere, the reference count btree (refcbt) stores
+ * >1 reference counts for extents of physical blocks.  In this
+ * operation, we're either raising or lowering the reference count of
+ * some subrange stored in the tree:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+-----+ +--+--------+---------
+ *  2  |   | 3 |  4  | |17|   55   |   10
+ * ----+   +---+-----+ +--+--------+---------
+ * X axis is physical blocks number;
+ * reference counts are the numbers inside the rectangles
+ *
+ * The first thing we need to do is to ensure that there are no
+ * refcount extents crossing either boundary of the range to be
+ * adjusted.  For any extent that does cross a boundary, split it into
+ * two extents so that we can increment the refcount of one of the
+ * pieces later:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+-----+ +--+--------+----+----
+ *  2  |   | 3 |  2  | |17|   55   | 10 | 10
+ * ----+   +---+-----+ +--+--------+----+----
+ *
+ * For this next step, let's assume that all the physical blocks in
+ * the adjustment range are mapped to a file and are therefore in use
+ * at least once.  Therefore, we can infer that any gap in the
+ * refcount tree within the adjustment range represents a physical
+ * extent with refcount == 1:
+ *
+ *      <------ adjustment range ------>
+ * ----+---+---+-----+-+--+--------+----+----
+ *  2  |"1"| 3 |  2  |1|17|   55   | 10 | 10
+ * ----+---+---+-----+-+--+--------+----+----
+ *      ^
+ *
+ * For each extent that falls within the interval range, figure out
+ * which extent is to the left or the right of that extent.  Now we
+ * have a left, current, and right extent.  If the new reference count
+ * of the center extent enables us to merge left, center, and right
+ * into one record covering all three, do so.  If the center extent is
+ * at the left end of the range, abuts the left extent, and its new
+ * reference count matches the left extent's record, then merge them.
+ * If the center extent is at the right end of the range, abuts the
+ * right extent, and the reference counts match, merge those.  In the
+ * example, we can left merge (assuming an increment operation):
+ *
+ *      <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ *    2    | 3 |  2  |1|17|   55   | 10 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *          ^
+ *
+ * For all other extents within the range, adjust the reference count
+ * or delete it if the refcount falls below 2.  If we were
+ * incrementing, the end result looks like this:
+ *
+ *      <------ adjustment range ------>
+ * --------+---+-----+-+--+--------+----+----
+ *    2    | 4 |  3  |2|18|   56   | 11 | 10
+ * --------+---+-----+-+--+--------+----+----
+ *
+ * The result of a decrement operation looks as such:
+ *
+ *      <------ adjustment range ------>
+ * ----+   +---+       +--+--------+----+----
+ *  2  |   | 2 |       |16|   54   |  9 | 10
+ * ----+   +---+       +--+--------+----+----
+ *      DDDD    111111DD
+ *
+ * The blocks marked "D" are freed; the blocks marked "1" are only
+ * referenced once and therefore the record is removed from the
+ * refcount btree.
+ */
+
+/* Next block after this extent. */
+static inline xfs_agblock_t
+xfs_refc_next(
+	struct xfs_refcount_irec	*rc)
+{
+	return rc->rc_startblock + rc->rc_blockcount;
+}
+
+/*
+ * Split a refcount extent that crosses agbno.
+ */
+STATIC int
+xfs_refcount_split_extent(
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			agbno,
+	bool				*shape_changed)
+{
+	struct xfs_refcount_irec	rcext, tmp;
+	int				found_rec;
+	int				error;
+
+	*shape_changed = false;
+	error = xfs_refcount_lookup_le(cur, agbno, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec)
+		return 0;
+
+	error = xfs_refcount_get_rec(cur, &rcext, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	if (rcext.rc_startblock == agbno || xfs_refc_next(&rcext) <= agbno)
+		return 0;
+
+	*shape_changed = true;
+	trace_xfs_refcount_split_extent(cur->bc_mp, cur->bc_private.a.agno,
+			&rcext, agbno);
+
+	/* Establish the right extent. */
+	tmp = rcext;
+	tmp.rc_startblock = agbno;
+	tmp.rc_blockcount -= (agbno - rcext.rc_startblock);
+	error = xfs_refcount_update(cur, &tmp);
+	if (error)
+		goto out_error;
+
+	/* Insert the left extent. */
+	tmp = rcext;
+	tmp.rc_blockcount = agbno - rcext.rc_startblock;
+	error = xfs_refcount_insert(cur, &tmp, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+	return error;
+
+out_error:
+	trace_xfs_refcount_split_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Merge the left, center, and right extents.
+ */
+STATIC int
+xfs_refcount_merge_center_extents(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*left,
+	struct xfs_refcount_irec	*center,
+	struct xfs_refcount_irec	*right,
+	unsigned long long		extlen,
+	xfs_agblock_t			*agbno,
+	xfs_extlen_t			*aglen)
+{
+	int				error;
+	int				found_rec;
+
+	trace_xfs_refcount_merge_center_extents(cur->bc_mp,
+			cur->bc_private.a.agno, left, center, right);
+
+	/*
+	 * Make sure the center and right extents are not in the btree.
+	 * If the center extent was synthesized, the first delete call
+	 * removes the right extent and we skip the second deletion.
+	 * If center and right were in the btree, then the first delete
+	 * call removes the center and the second one removes the right
+	 * extent.
+	 */
+	error = xfs_refcount_lookup_ge(cur, center->rc_startblock,
+			&found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	error = xfs_refcount_delete(cur, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	if (center->rc_refcount > 1) {
+		error = xfs_refcount_delete(cur, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+	}
+
+	/* Enlarge the left extent. */
+	error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+			&found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	left->rc_blockcount = extlen;
+	error = xfs_refcount_update(cur, left);
+	if (error)
+		goto out_error;
+
+	*aglen = 0;
+	return error;
+
+out_error:
+	trace_xfs_refcount_merge_center_extents_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Merge with the left extent.
+ */
+STATIC int
+xfs_refcount_merge_left_extent(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*left,
+	struct xfs_refcount_irec	*cleft,
+	xfs_agblock_t			*agbno,
+	xfs_extlen_t			*aglen)
+{
+	int				error;
+	int				found_rec;
+
+	trace_xfs_refcount_merge_left_extent(cur->bc_mp,
+			cur->bc_private.a.agno, left, cleft);
+
+	/* If the extent at agbno (cleft) wasn't synthesized, remove it. */
+	if (cleft->rc_refcount > 1) {
+		error = xfs_refcount_lookup_le(cur, cleft->rc_startblock,
+				&found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+
+		error = xfs_refcount_delete(cur, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+	}
+
+	/* Enlarge the left extent. */
+	error = xfs_refcount_lookup_le(cur, left->rc_startblock,
+			&found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	left->rc_blockcount += cleft->rc_blockcount;
+	error = xfs_refcount_update(cur, left);
+	if (error)
+		goto out_error;
+
+	*agbno += cleft->rc_blockcount;
+	*aglen -= cleft->rc_blockcount;
+	return error;
+
+out_error:
+	trace_xfs_refcount_merge_left_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Merge with the right extent.
+ */
+STATIC int
+xfs_refcount_merge_right_extent(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*right,
+	struct xfs_refcount_irec	*cright,
+	xfs_agblock_t			*agbno,
+	xfs_extlen_t			*aglen)
+{
+	int				error;
+	int				found_rec;
+
+	trace_xfs_refcount_merge_right_extent(cur->bc_mp,
+			cur->bc_private.a.agno, cright, right);
+
+	/*
+	 * If the extent ending at agbno+aglen (cright) wasn't synthesized,
+	 * remove it.
+	 */
+	if (cright->rc_refcount > 1) {
+		error = xfs_refcount_lookup_le(cur, cright->rc_startblock,
+			&found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+
+		error = xfs_refcount_delete(cur, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+	}
+
+	/* Enlarge the right extent. */
+	error = xfs_refcount_lookup_le(cur, right->rc_startblock,
+			&found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	right->rc_startblock -= cright->rc_blockcount;
+	right->rc_blockcount += cright->rc_blockcount;
+	error = xfs_refcount_update(cur, right);
+	if (error)
+		goto out_error;
+
+	*aglen -= cright->rc_blockcount;
+	return error;
+
+out_error:
+	trace_xfs_refcount_merge_right_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+#define XFS_FIND_RCEXT_SHARED	1
+#define XFS_FIND_RCEXT_COW	2
+/*
+ * Find the left extent and the one after it (cleft).  This function assumes
+ * that we've already split any extent crossing agbno.
+ */
+STATIC int
+xfs_refcount_find_left_extents(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*left,
+	struct xfs_refcount_irec	*cleft,
+	xfs_agblock_t			agbno,
+	xfs_extlen_t			aglen,
+	int				flags)
+{
+	struct xfs_refcount_irec	tmp;
+	int				error;
+	int				found_rec;
+
+	left->rc_startblock = cleft->rc_startblock = NULLAGBLOCK;
+	error = xfs_refcount_lookup_le(cur, agbno - 1, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec)
+		return 0;
+
+	error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	if (xfs_refc_next(&tmp) != agbno)
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+		return 0;
+	/* We have a left extent; retrieve (or invent) the next right one */
+	*left = tmp;
+
+	error = xfs_btree_increment(cur, 0, &found_rec);
+	if (error)
+		goto out_error;
+	if (found_rec) {
+		error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+
+		/* if tmp starts at the end of our range, just use that */
+		if (tmp.rc_startblock == agbno)
+			*cleft = tmp;
+		else {
+			/*
+			 * There's a gap in the refcntbt at the start of the
+			 * range we're interested in (refcount == 1) so
+			 * synthesize the implied extent and pass it back.
+			 * We assume here that the agbno/aglen range was
+			 * passed in from a data fork extent mapping and
+			 * therefore is allocated to exactly one owner.
+			 */
+			cleft->rc_startblock = agbno;
+			cleft->rc_blockcount = min(aglen,
+					tmp.rc_startblock - agbno);
+			cleft->rc_refcount = 1;
+		}
+	} else {
+		/*
+		 * No extents, so pretend that there's one covering the whole
+		 * range.
+		 */
+		cleft->rc_startblock = agbno;
+		cleft->rc_blockcount = aglen;
+		cleft->rc_refcount = 1;
+	}
+	trace_xfs_refcount_find_left_extent(cur->bc_mp, cur->bc_private.a.agno,
+			left, cleft, agbno);
+	return error;
+
+out_error:
+	trace_xfs_refcount_find_left_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Find the right extent and the one before it (cright).  This function
+ * assumes that we've already split any extents crossing agbno + aglen.
+ */
+STATIC int
+xfs_refcount_find_right_extents(
+	struct xfs_btree_cur		*cur,
+	struct xfs_refcount_irec	*right,
+	struct xfs_refcount_irec	*cright,
+	xfs_agblock_t			agbno,
+	xfs_extlen_t			aglen,
+	int				flags)
+{
+	struct xfs_refcount_irec	tmp;
+	int				error;
+	int				found_rec;
+
+	right->rc_startblock = cright->rc_startblock = NULLAGBLOCK;
+	error = xfs_refcount_lookup_ge(cur, agbno + aglen, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec)
+		return 0;
+
+	error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1, out_error);
+
+	if (tmp.rc_startblock != agbno + aglen)
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_SHARED) && tmp.rc_refcount < 2)
+		return 0;
+	if ((flags & XFS_FIND_RCEXT_COW) && tmp.rc_refcount > 1)
+		return 0;
+	/* We have a right extent; retrieve (or invent) the next left one */
+	*right = tmp;
+
+	error = xfs_btree_decrement(cur, 0, &found_rec);
+	if (error)
+		goto out_error;
+	if (found_rec) {
+		error = xfs_refcount_get_rec(cur, &tmp, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, found_rec == 1,
+				out_error);
+
+		/* if tmp ends at the end of our range, just use that */
+		if (xfs_refc_next(&tmp) == agbno + aglen)
+			*cright = tmp;
+		else {
+			/*
+			 * There's a gap in the refcntbt at the end of the
+			 * range we're interested in (refcount == 1) so
+			 * create the implied extent and pass it back.
+			 * We assume here that the agbno/aglen range was
+			 * passed in from a data fork extent mapping and
+			 * therefore is allocated to exactly one owner.
+			 */
+			cright->rc_startblock = max(agbno, xfs_refc_next(&tmp));
+			cright->rc_blockcount = right->rc_startblock -
+					cright->rc_startblock;
+			cright->rc_refcount = 1;
+		}
+	} else {
+		/*
+		 * No extents, so pretend that there's one covering the whole
+		 * range.
+		 */
+		cright->rc_startblock = agbno;
+		cright->rc_blockcount = aglen;
+		cright->rc_refcount = 1;
+	}
+	trace_xfs_refcount_find_right_extent(cur->bc_mp, cur->bc_private.a.agno,
+			cright, right, agbno + aglen);
+	return error;
+
+out_error:
+	trace_xfs_refcount_find_right_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/* Is this extent valid? */
+static inline bool
+xfs_refc_valid(
+	struct xfs_refcount_irec	*rc)
+{
+	return rc->rc_startblock != NULLAGBLOCK;
+}
+
+/*
+ * Try to merge with any extents on the boundaries of the adjustment range.
+ */
+STATIC int
+xfs_refcount_merge_extents(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		*agbno,
+	xfs_extlen_t		*aglen,
+	enum xfs_refc_adjust_op adjust,
+	int			flags,
+	bool			*shape_changed)
+{
+	struct xfs_refcount_irec	left = {0}, cleft = {0};
+	struct xfs_refcount_irec	cright = {0}, right = {0};
+	int				error;
+	unsigned long long		ulen;
+	bool				cequal;
+
+	*shape_changed = false;
+	/*
+	 * Find the extent just below agbno [left], just above agbno [cleft],
+	 * just below (agbno + aglen) [cright], and just above (agbno + aglen)
+	 * [right].
+	 */
+	error = xfs_refcount_find_left_extents(cur, &left, &cleft, *agbno,
+			*aglen, flags);
+	if (error)
+		return error;
+	error = xfs_refcount_find_right_extents(cur, &right, &cright, *agbno,
+			*aglen, flags);
+	if (error)
+		return error;
+
+	/* No left or right extent to merge; exit. */
+	if (!xfs_refc_valid(&left) && !xfs_refc_valid(&right))
+		return 0;
+
+	cequal = (cleft.rc_startblock == cright.rc_startblock) &&
+		 (cleft.rc_blockcount == cright.rc_blockcount);
+
+	/* Try to merge left, cleft, and right.  cleft must == cright. */
+	ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount +
+			right.rc_blockcount;
+	if (xfs_refc_valid(&left) && xfs_refc_valid(&right) &&
+	    xfs_refc_valid(&cleft) && xfs_refc_valid(&cright) && cequal &&
+	    left.rc_refcount == cleft.rc_refcount + adjust &&
+	    right.rc_refcount == cleft.rc_refcount + adjust &&
+	    ulen < MAXREFCEXTLEN) {
+		*shape_changed = true;
+		return xfs_refcount_merge_center_extents(cur, &left, &cleft,
+				&right, ulen, agbno, aglen);
+	}
+
+	/* Try to merge left and cleft. */
+	ulen = (unsigned long long)left.rc_blockcount + cleft.rc_blockcount;
+	if (xfs_refc_valid(&left) && xfs_refc_valid(&cleft) &&
+	    left.rc_refcount == cleft.rc_refcount + adjust &&
+	    ulen < MAXREFCEXTLEN) {
+		*shape_changed = true;
+		error = xfs_refcount_merge_left_extent(cur, &left, &cleft,
+				agbno, aglen);
+		if (error)
+			return error;
+
+		/*
+		 * If we just merged left + cleft and cleft == cright,
+		 * we no longer have a cright to merge with right.  We're done.
+		 */
+		if (cequal)
+			return 0;
+	}
+
+	/* Try to merge cright and right. */
+	ulen = (unsigned long long)right.rc_blockcount + cright.rc_blockcount;
+	if (xfs_refc_valid(&right) && xfs_refc_valid(&cright) &&
+	    right.rc_refcount == cright.rc_refcount + adjust &&
+	    ulen < MAXREFCEXTLEN) {
+		*shape_changed = true;
+		return xfs_refcount_merge_right_extent(cur, &right, &cright,
+				agbno, aglen);
+	}
+
+	return error;
+}
+
+/*
+ * While we're adjusting the refcounts records of an extent, we have
+ * to keep an eye on the number of extents we're dirtying -- run too
+ * many in a single transaction and we'll exceed the transaction's
+ * reservation and crash the fs.  Each record adds 12 bytes to the
+ * log (plus any key updates) so we'll conservatively assume 24 bytes
+ * per record.  We must also leave space for btree splits on both ends
+ * of the range and space for the CUD and a new CUI.
+ *
+ * XXX: This is a pretty hand-wavy estimate.  The penalty for guessing
+ * true incorrectly is a shutdown FS; the penalty for guessing false
+ * incorrectly is more transaction rolls than might be necessary.
+ * Be conservative here.
+ */
+static bool
+xfs_refcount_still_have_space(
+	struct xfs_btree_cur		*cur)
+{
+	unsigned long			overhead;
+
+	overhead = cur->bc_private.a.priv.refc.shape_changes *
+			xfs_allocfree_log_count(cur->bc_mp, 1);
+	overhead *= cur->bc_mp->m_sb.sb_blocksize;
+
+	/*
+	 * Only allow 2 refcount extent updates per transaction if the
+	 * refcount continue update "error" has been injected.
+	 */
+	if (cur->bc_private.a.priv.refc.nr_ops > 2 &&
+	    XFS_TEST_ERROR(false, cur->bc_mp,
+			XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE,
+			XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE))
+		return false;
+
+	if (cur->bc_private.a.priv.refc.nr_ops == 0)
+		return true;
+	else if (overhead > cur->bc_tp->t_log_res)
+		return false;
+	return  cur->bc_tp->t_log_res - overhead >
+		cur->bc_private.a.priv.refc.nr_ops * 32;
+}
+
+/*
+ * Adjust the refcounts of middle extents.  At this point we should have
+ * split extents that crossed the adjustment range; merged with adjacent
+ * extents; and updated agbno/aglen to reflect the merges.  Therefore,
+ * all we have to do is update the extents inside [agbno, agbno + aglen].
+ */
+STATIC int
+xfs_refcount_adjust_extents(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		*agbno,
+	xfs_extlen_t		*aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_refcount_irec	ext, tmp;
+	int				error;
+	int				found_rec, found_tmp;
+	xfs_fsblock_t			fsbno;
+
+	/* Merging did all the work already. */
+	if (*aglen == 0)
+		return 0;
+
+	error = xfs_refcount_lookup_ge(cur, *agbno, &found_rec);
+	if (error)
+		goto out_error;
+
+	while (*aglen > 0 && xfs_refcount_still_have_space(cur)) {
+		error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+		if (error)
+			goto out_error;
+		if (!found_rec) {
+			ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks;
+			ext.rc_blockcount = 0;
+			ext.rc_refcount = 0;
+		}
+
+		/*
+		 * Deal with a hole in the refcount tree; if a file maps to
+		 * these blocks and there's no refcountbt record, pretend that
+		 * there is one with refcount == 1.
+		 */
+		if (ext.rc_startblock != *agbno) {
+			tmp.rc_startblock = *agbno;
+			tmp.rc_blockcount = min(*aglen,
+					ext.rc_startblock - *agbno);
+			tmp.rc_refcount = 1 + adj;
+			trace_xfs_refcount_modify_extent(cur->bc_mp,
+					cur->bc_private.a.agno, &tmp);
+
+			/*
+			 * Either cover the hole (increment) or
+			 * delete the range (decrement).
+			 */
+			if (tmp.rc_refcount) {
+				error = xfs_refcount_insert(cur, &tmp,
+						&found_tmp);
+				if (error)
+					goto out_error;
+				XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+						found_tmp == 1, out_error);
+				cur->bc_private.a.priv.refc.nr_ops++;
+			} else {
+				fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+						cur->bc_private.a.agno,
+						tmp.rc_startblock);
+				xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+						tmp.rc_blockcount, oinfo);
+			}
+
+			(*agbno) += tmp.rc_blockcount;
+			(*aglen) -= tmp.rc_blockcount;
+
+			error = xfs_refcount_lookup_ge(cur, *agbno,
+					&found_rec);
+			if (error)
+				goto out_error;
+		}
+
+		/* Stop if there's nothing left to modify */
+		if (*aglen == 0 || !xfs_refcount_still_have_space(cur))
+			break;
+
+		/*
+		 * Adjust the reference count and either update the tree
+		 * (incr) or free the blocks (decr).
+		 */
+		if (ext.rc_refcount == MAXREFCOUNT)
+			goto skip;
+		ext.rc_refcount += adj;
+		trace_xfs_refcount_modify_extent(cur->bc_mp,
+				cur->bc_private.a.agno, &ext);
+		if (ext.rc_refcount > 1) {
+			error = xfs_refcount_update(cur, &ext);
+			if (error)
+				goto out_error;
+			cur->bc_private.a.priv.refc.nr_ops++;
+		} else if (ext.rc_refcount == 1) {
+			error = xfs_refcount_delete(cur, &found_rec);
+			if (error)
+				goto out_error;
+			XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+					found_rec == 1, out_error);
+			cur->bc_private.a.priv.refc.nr_ops++;
+			goto advloop;
+		} else {
+			fsbno = XFS_AGB_TO_FSB(cur->bc_mp,
+					cur->bc_private.a.agno,
+					ext.rc_startblock);
+			xfs_bmap_add_free(cur->bc_mp, dfops, fsbno,
+					ext.rc_blockcount, oinfo);
+		}
+
+skip:
+		error = xfs_btree_increment(cur, 0, &found_rec);
+		if (error)
+			goto out_error;
+
+advloop:
+		(*agbno) += ext.rc_blockcount;
+		(*aglen) -= ext.rc_blockcount;
+	}
+
+	return error;
+out_error:
+	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/* Adjust the reference count of a range of AG blocks. */
+STATIC int
+xfs_refcount_adjust(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	xfs_agblock_t		*new_agbno,
+	xfs_extlen_t		*new_aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_owner_info	*oinfo)
+{
+	bool			shape_changed;
+	int			shape_changes = 0;
+	int			error;
+
+	*new_agbno = agbno;
+	*new_aglen = aglen;
+	if (adj == XFS_REFCOUNT_ADJUST_INCREASE)
+		trace_xfs_refcount_increase(cur->bc_mp, cur->bc_private.a.agno,
+				agbno, aglen);
+	else
+		trace_xfs_refcount_decrease(cur->bc_mp, cur->bc_private.a.agno,
+				agbno, aglen);
+
+	/*
+	 * Ensure that no rcextents cross the boundary of the adjustment range.
+	 */
+	error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+	if (error)
+		goto out_error;
+	if (shape_changed)
+		shape_changes++;
+
+	error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+	if (error)
+		goto out_error;
+	if (shape_changed)
+		shape_changes++;
+
+	/*
+	 * Try to merge with the left or right extents of the range.
+	 */
+	error = xfs_refcount_merge_extents(cur, new_agbno, new_aglen, adj,
+			XFS_FIND_RCEXT_SHARED, &shape_changed);
+	if (error)
+		goto out_error;
+	if (shape_changed)
+		shape_changes++;
+	if (shape_changes)
+		cur->bc_private.a.priv.refc.shape_changes++;
+
+	/* Now that we've taken care of the ends, adjust the middle extents */
+	error = xfs_refcount_adjust_extents(cur, new_agbno, new_aglen,
+			adj, dfops, oinfo);
+	if (error)
+		goto out_error;
+
+	return 0;
+
+out_error:
+	trace_xfs_refcount_adjust_error(cur->bc_mp, cur->bc_private.a.agno,
+			error, _RET_IP_);
+	return error;
+}
+
+/* Clean up after calling xfs_refcount_finish_one. */
+void
+xfs_refcount_finish_one_cleanup(
+	struct xfs_trans	*tp,
+	struct xfs_btree_cur	*rcur,
+	int			error)
+{
+	struct xfs_buf		*agbp;
+
+	if (rcur == NULL)
+		return;
+	agbp = rcur->bc_private.a.agbp;
+	xfs_btree_del_cursor(rcur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+	if (error)
+		xfs_trans_brelse(tp, agbp);
+}
+
+/*
+ * Process one of the deferred refcount operations.  We pass back the
+ * btree cursor to maintain our lock on the btree between calls.
+ * This saves time and eliminates a buffer deadlock between the
+ * superblock and the AGF because we'll always grab them in the same
+ * order.
+ */
+int
+xfs_refcount_finish_one(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dfops,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	xfs_fsblock_t			*new_fsb,
+	xfs_extlen_t			*new_len,
+	struct xfs_btree_cur		**pcur)
+{
+	struct xfs_mount		*mp = tp->t_mountp;
+	struct xfs_btree_cur		*rcur;
+	struct xfs_buf			*agbp = NULL;
+	int				error = 0;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			bno;
+	xfs_agblock_t			new_agbno;
+	unsigned long			nr_ops = 0;
+	int				shape_changes = 0;
+
+	agno = XFS_FSB_TO_AGNO(mp, startblock);
+	ASSERT(agno != NULLAGNUMBER);
+	bno = XFS_FSB_TO_AGBNO(mp, startblock);
+
+	trace_xfs_refcount_deferred(mp, XFS_FSB_TO_AGNO(mp, startblock),
+			type, XFS_FSB_TO_AGBNO(mp, startblock),
+			blockcount);
+
+	if (XFS_TEST_ERROR(false, mp,
+			XFS_ERRTAG_REFCOUNT_FINISH_ONE,
+			XFS_RANDOM_REFCOUNT_FINISH_ONE))
+		return -EIO;
+
+	/*
+	 * If we haven't gotten a cursor or the cursor AG doesn't match
+	 * the startblock, get one now.
+	 */
+	rcur = *pcur;
+	if (rcur != NULL && rcur->bc_private.a.agno != agno) {
+		nr_ops = rcur->bc_private.a.priv.refc.nr_ops;
+		shape_changes = rcur->bc_private.a.priv.refc.shape_changes;
+		xfs_refcount_finish_one_cleanup(tp, rcur, 0);
+		rcur = NULL;
+		*pcur = NULL;
+	}
+	if (rcur == NULL) {
+		error = xfs_alloc_read_agf(tp->t_mountp, tp, agno,
+				XFS_ALLOC_FLAG_FREEING, &agbp);
+		if (error)
+			return error;
+		if (!agbp)
+			return -EFSCORRUPTED;
+
+		rcur = xfs_refcountbt_init_cursor(mp, tp, agbp, agno, dfops);
+		if (!rcur) {
+			error = -ENOMEM;
+			goto out_cur;
+		}
+		rcur->bc_private.a.priv.refc.nr_ops = nr_ops;
+		rcur->bc_private.a.priv.refc.shape_changes = shape_changes;
+	}
+	*pcur = rcur;
+
+	switch (type) {
+	case XFS_REFCOUNT_INCREASE:
+		error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+			new_len, XFS_REFCOUNT_ADJUST_INCREASE, dfops, NULL);
+		*new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+		break;
+	case XFS_REFCOUNT_DECREASE:
+		error = xfs_refcount_adjust(rcur, bno, blockcount, &new_agbno,
+			new_len, XFS_REFCOUNT_ADJUST_DECREASE, dfops, NULL);
+		*new_fsb = XFS_AGB_TO_FSB(mp, agno, new_agbno);
+		break;
+	case XFS_REFCOUNT_ALLOC_COW:
+		*new_fsb = startblock + blockcount;
+		*new_len = 0;
+		error = __xfs_refcount_cow_alloc(rcur, bno, blockcount, dfops);
+		break;
+	case XFS_REFCOUNT_FREE_COW:
+		*new_fsb = startblock + blockcount;
+		*new_len = 0;
+		error = __xfs_refcount_cow_free(rcur, bno, blockcount, dfops);
+		break;
+	default:
+		ASSERT(0);
+		error = -EFSCORRUPTED;
+	}
+	if (!error && *new_len > 0)
+		trace_xfs_refcount_finish_one_leftover(mp, agno, type,
+				bno, blockcount, new_agbno, *new_len);
+	return error;
+
+out_cur:
+	xfs_trans_brelse(tp, agbp);
+
+	return error;
+}
+
+/*
+ * Record a refcount intent for later processing.
+ */
+static int
+__xfs_refcount_add(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount)
+{
+	struct xfs_refcount_intent	*ri;
+
+	trace_xfs_refcount_defer(mp, XFS_FSB_TO_AGNO(mp, startblock),
+			type, XFS_FSB_TO_AGBNO(mp, startblock),
+			blockcount);
+
+	ri = kmem_alloc(sizeof(struct xfs_refcount_intent),
+			KM_SLEEP | KM_NOFS);
+	INIT_LIST_HEAD(&ri->ri_list);
+	ri->ri_type = type;
+	ri->ri_startblock = startblock;
+	ri->ri_blockcount = blockcount;
+
+	xfs_defer_add(dfops, XFS_DEFER_OPS_TYPE_REFCOUNT, &ri->ri_list);
+	return 0;
+}
+
+/*
+ * Increase the reference count of the blocks backing a file's extent.
+ */
+int
+xfs_refcount_increase_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	struct xfs_bmbt_irec		*PREV)
+{
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_INCREASE,
+			PREV->br_startblock, PREV->br_blockcount);
+}
+
+/*
+ * Decrease the reference count of the blocks backing a file's extent.
+ */
+int
+xfs_refcount_decrease_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	struct xfs_bmbt_irec		*PREV)
+{
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_DECREASE,
+			PREV->br_startblock, PREV->br_blockcount);
+}
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen.  If
+ * find_end_of_shared is set, return the longest contiguous extent of
+ * shared blocks; if not, just return the first extent we find.  If no
+ * shared blocks are found, fbno and flen will be set to NULLAGBLOCK
+ * and 0, respectively.
+ */
+int
+xfs_refcount_find_shared(
+	struct xfs_btree_cur		*cur,
+	xfs_agblock_t			agbno,
+	xfs_extlen_t			aglen,
+	xfs_agblock_t			*fbno,
+	xfs_extlen_t			*flen,
+	bool				find_end_of_shared)
+{
+	struct xfs_refcount_irec	tmp;
+	int				i;
+	int				have;
+	int				error;
+
+	trace_xfs_refcount_find_shared(cur->bc_mp, cur->bc_private.a.agno,
+			agbno, aglen);
+
+	/* By default, skip the whole range */
+	*fbno = NULLAGBLOCK;
+	*flen = 0;
+
+	/* Try to find a refcount extent that crosses the start */
+	error = xfs_refcount_lookup_le(cur, agbno, &have);
+	if (error)
+		goto out_error;
+	if (!have) {
+		/* No left extent, look at the next one */
+		error = xfs_btree_increment(cur, 0, &have);
+		if (error)
+			goto out_error;
+		if (!have)
+			goto done;
+	}
+	error = xfs_refcount_get_rec(cur, &tmp, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+
+	/* If the extent ends before the start, look at the next one */
+	if (tmp.rc_startblock + tmp.rc_blockcount <= agbno) {
+		error = xfs_btree_increment(cur, 0, &have);
+		if (error)
+			goto out_error;
+		if (!have)
+			goto done;
+		error = xfs_refcount_get_rec(cur, &tmp, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+	}
+
+	/* If the extent starts after the range we want, bail out */
+	if (tmp.rc_startblock >= agbno + aglen)
+		goto done;
+
+	/* We found the start of a shared extent! */
+	if (tmp.rc_startblock < agbno) {
+		tmp.rc_blockcount -= (agbno - tmp.rc_startblock);
+		tmp.rc_startblock = agbno;
+	}
+
+	*fbno = tmp.rc_startblock;
+	*flen = min(tmp.rc_blockcount, agbno + aglen - *fbno);
+	if (!find_end_of_shared)
+		goto done;
+
+	/* Otherwise, find the end of this shared extent */
+	while (*fbno + *flen < agbno + aglen) {
+		error = xfs_btree_increment(cur, 0, &have);
+		if (error)
+			goto out_error;
+		if (!have)
+			break;
+		error = xfs_refcount_get_rec(cur, &tmp, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp, i == 1, out_error);
+		if (tmp.rc_startblock >= agbno + aglen ||
+		    tmp.rc_startblock != *fbno + *flen)
+			break;
+		*flen = min(*flen + tmp.rc_blockcount, agbno + aglen - *fbno);
+	}
+
+done:
+	trace_xfs_refcount_find_shared_result(cur->bc_mp,
+			cur->bc_private.a.agno, *fbno, *flen);
+
+out_error:
+	if (error)
+		trace_xfs_refcount_find_shared_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Recovering CoW Blocks After a Crash
+ *
+ * Due to the way that the copy on write mechanism works, there's a window of
+ * opportunity in which we can lose track of allocated blocks during a crash.
+ * Because CoW uses delayed allocation in the in-core CoW fork, writeback
+ * causes blocks to be allocated and stored in the CoW fork.  The blocks are
+ * no longer in the free space btree but are not otherwise recorded anywhere
+ * until the write completes and the blocks are mapped into the file.  A crash
+ * in between allocation and remapping results in the replacement blocks being
+ * lost.  This situation is exacerbated by the CoW extent size hint because
+ * allocations can hang around for long time.
+ *
+ * However, there is a place where we can record these allocations before they
+ * become mappings -- the reference count btree.  The btree does not record
+ * extents with refcount == 1, so we can record allocations with a refcount of
+ * 1.  Blocks being used for CoW writeout cannot be shared, so there should be
+ * no conflict with shared block records.  These mappings should be created
+ * when we allocate blocks to the CoW fork and deleted when they're removed
+ * from the CoW fork.
+ *
+ * Minor nit: records for in-progress CoW allocations and records for shared
+ * extents must never be merged, to preserve the property that (except for CoW
+ * allocations) there are no refcount btree entries with refcount == 1.  The
+ * only time this could potentially happen is when unsharing a block that's
+ * adjacent to CoW allocations, so we must be careful to avoid this.
+ *
+ * At mount time we recover lost CoW allocations by searching the refcount
+ * btree for these refcount == 1 mappings.  These represent CoW allocations
+ * that were in progress at the time the filesystem went down, so we can free
+ * them to get the space back.
+ *
+ * This mechanism is superior to creating EFIs for unmapped CoW extents for
+ * several reasons -- first, EFIs pin the tail of the log and would have to be
+ * periodically relogged to avoid filling up the log.  Second, CoW completions
+ * will have to file an EFD and create new EFIs for whatever remains in the
+ * CoW fork; this partially takes care of (1) but extent-size reservations
+ * will have to periodically relog even if there's no writeout in progress.
+ * This can happen if the CoW extent size hint is set, which you really want.
+ * Third, EFIs cannot currently be automatically relogged into newer
+ * transactions to advance the log tail.  Fourth, stuffing the log full of
+ * EFIs places an upper bound on the number of CoW allocations that can be
+ * held filesystem-wide at any given time.  Recording them in the refcount
+ * btree doesn't require us to maintain any state in memory and doesn't pin
+ * the log.
+ */
+/*
+ * Adjust the refcounts of CoW allocations.  These allocations are "magic"
+ * in that they're not referenced anywhere else in the filesystem, so we
+ * stash them in the refcount btree with a refcount of 1 until either file
+ * remapping (or CoW cancellation) happens.
+ */
+STATIC int
+xfs_refcount_adjust_cow_extents(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_refcount_irec	ext, tmp;
+	int				error;
+	int				found_rec, found_tmp;
+
+	if (aglen == 0)
+		return 0;
+
+	/* Find any overlapping refcount records */
+	error = xfs_refcount_lookup_ge(cur, agbno, &found_rec);
+	if (error)
+		goto out_error;
+	error = xfs_refcount_get_rec(cur, &ext, &found_rec);
+	if (error)
+		goto out_error;
+	if (!found_rec) {
+		ext.rc_startblock = cur->bc_mp->m_sb.sb_agblocks +
+				XFS_REFC_COW_START;
+		ext.rc_blockcount = 0;
+		ext.rc_refcount = 0;
+	}
+
+	switch (adj) {
+	case XFS_REFCOUNT_ADJUST_COW_ALLOC:
+		/* Adding a CoW reservation, there should be nothing here. */
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+				ext.rc_startblock >= agbno + aglen, out_error);
+
+		tmp.rc_startblock = agbno;
+		tmp.rc_blockcount = aglen;
+		tmp.rc_refcount = 1;
+		trace_xfs_refcount_modify_extent(cur->bc_mp,
+				cur->bc_private.a.agno, &tmp);
+
+		error = xfs_refcount_insert(cur, &tmp,
+				&found_tmp);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+				found_tmp == 1, out_error);
+		break;
+	case XFS_REFCOUNT_ADJUST_COW_FREE:
+		/* Removing a CoW reservation, there should be one extent. */
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+			ext.rc_startblock == agbno, out_error);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+			ext.rc_blockcount == aglen, out_error);
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+			ext.rc_refcount == 1, out_error);
+
+		ext.rc_refcount = 0;
+		trace_xfs_refcount_modify_extent(cur->bc_mp,
+				cur->bc_private.a.agno, &ext);
+		error = xfs_refcount_delete(cur, &found_rec);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(cur->bc_mp,
+				found_rec == 1, out_error);
+		break;
+	default:
+		ASSERT(0);
+	}
+
+	return error;
+out_error:
+	trace_xfs_refcount_modify_extent_error(cur->bc_mp,
+			cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Add or remove refcount btree entries for CoW reservations.
+ */
+STATIC int
+xfs_refcount_adjust_cow(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	enum xfs_refc_adjust_op	adj,
+	struct xfs_defer_ops	*dfops)
+{
+	bool			shape_changed;
+	int			error;
+
+	agbno += XFS_REFC_COW_START;
+
+	/*
+	 * Ensure that no rcextents cross the boundary of the adjustment range.
+	 */
+	error = xfs_refcount_split_extent(cur, agbno, &shape_changed);
+	if (error)
+		goto out_error;
+
+	error = xfs_refcount_split_extent(cur, agbno + aglen, &shape_changed);
+	if (error)
+		goto out_error;
+
+	/*
+	 * Try to merge with the left or right extents of the range.
+	 */
+	error = xfs_refcount_merge_extents(cur, &agbno, &aglen, adj,
+			XFS_FIND_RCEXT_COW, &shape_changed);
+	if (error)
+		goto out_error;
+
+	/* Now that we've taken care of the ends, adjust the middle extents */
+	error = xfs_refcount_adjust_cow_extents(cur, agbno, aglen, adj,
+			dfops, NULL);
+	if (error)
+		goto out_error;
+
+	return 0;
+
+out_error:
+	trace_xfs_refcount_adjust_cow_error(cur->bc_mp, cur->bc_private.a.agno,
+			error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Record a CoW allocation in the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_alloc(
+	struct xfs_btree_cur	*rcur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	struct xfs_defer_ops	*dfops)
+{
+	int			error;
+
+	trace_xfs_refcount_cow_increase(rcur->bc_mp, rcur->bc_private.a.agno,
+			agbno, aglen);
+
+	/* Add refcount btree reservation */
+	error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+			XFS_REFCOUNT_ADJUST_COW_ALLOC, dfops);
+	if (error)
+		return error;
+
+	/* Add rmap entry */
+	if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
+		error = xfs_rmap_alloc_extent(rcur->bc_mp, dfops,
+				rcur->bc_private.a.agno,
+				agbno, aglen, XFS_RMAP_OWN_COW);
+		if (error)
+			return error;
+	}
+
+	return error;
+}
+
+/*
+ * Remove a CoW allocation from the refcount btree.
+ */
+STATIC int
+__xfs_refcount_cow_free(
+	struct xfs_btree_cur	*rcur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	struct xfs_defer_ops	*dfops)
+{
+	int			error;
+
+	trace_xfs_refcount_cow_decrease(rcur->bc_mp, rcur->bc_private.a.agno,
+			agbno, aglen);
+
+	/* Remove refcount btree reservation */
+	error = xfs_refcount_adjust_cow(rcur, agbno, aglen,
+			XFS_REFCOUNT_ADJUST_COW_FREE, dfops);
+	if (error)
+		return error;
+
+	/* Remove rmap entry */
+	if (xfs_sb_version_hasrmapbt(&rcur->bc_mp->m_sb)) {
+		error = xfs_rmap_free_extent(rcur->bc_mp, dfops,
+				rcur->bc_private.a.agno,
+				agbno, aglen, XFS_RMAP_OWN_COW);
+		if (error)
+			return error;
+	}
+
+	return error;
+}
+
+/* Record a CoW staging extent in the refcount btree. */
+int
+xfs_refcount_alloc_cow_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	xfs_fsblock_t			fsb,
+	xfs_extlen_t			len)
+{
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_ALLOC_COW,
+			fsb, len);
+}
+
+/* Forget a CoW staging event in the refcount btree. */
+int
+xfs_refcount_free_cow_extent(
+	struct xfs_mount		*mp,
+	struct xfs_defer_ops		*dfops,
+	xfs_fsblock_t			fsb,
+	xfs_extlen_t			len)
+{
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	return __xfs_refcount_add(mp, dfops, XFS_REFCOUNT_FREE_COW,
+			fsb, len);
+}
+
+struct xfs_refcount_recovery {
+	struct list_head		rr_list;
+	struct xfs_refcount_irec	rr_rrec;
+};
+
+/* Stuff an extent on the recovery list. */
+STATIC int
+xfs_refcount_recover_extent(
+	struct xfs_btree_cur		*cur,
+	union xfs_btree_rec		*rec,
+	void				*priv)
+{
+	struct list_head		*debris = priv;
+	struct xfs_refcount_recovery	*rr;
+
+	if (be32_to_cpu(rec->refc.rc_refcount) != 1)
+		return -EFSCORRUPTED;
+
+	rr = kmem_alloc(sizeof(struct xfs_refcount_recovery), KM_SLEEP);
+	xfs_refcount_btrec_to_irec(rec, &rr->rr_rrec);
+	list_add_tail(&rr->rr_list, debris);
+
+	return 0;
+}
+
+/* Find and remove leftover CoW reservations. */
+int
+xfs_refcount_recover_cow_leftovers(
+	struct xfs_mount		*mp,
+	xfs_agnumber_t			agno)
+{
+	struct xfs_trans		*tp;
+	struct xfs_btree_cur		*cur;
+	struct xfs_buf			*agbp;
+	struct xfs_refcount_recovery	*rr, *n;
+	struct list_head		debris;
+	union xfs_btree_irec		low;
+	union xfs_btree_irec		high;
+	struct xfs_defer_ops		dfops;
+	xfs_fsblock_t			fsb;
+	xfs_agblock_t			agbno;
+	int				error;
+
+	if (mp->m_sb.sb_agblocks >= XFS_REFC_COW_START)
+		return -EOPNOTSUPP;
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
+	/* Find all the leftover CoW staging extents. */
+	INIT_LIST_HEAD(&debris);
+	memset(&low, 0, sizeof(low));
+	memset(&high, 0, sizeof(high));
+	low.rc.rc_startblock = XFS_REFC_COW_START;
+	high.rc.rc_startblock = -1U;
+	error = xfs_btree_query_range(cur, &low, &high,
+			xfs_refcount_recover_extent, &debris);
+	if (error)
+		goto out_cursor;
+	xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR);
+	xfs_buf_relse(agbp);
+
+	/* Now iterate the list to free the leftovers */
+	list_for_each_entry(rr, &debris, rr_list) {
+		/* Set up transaction. */
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+		if (error)
+			goto out_free;
+
+		trace_xfs_refcount_recover_extent(mp, agno, &rr->rr_rrec);
+
+		/* Free the orphan record */
+		xfs_defer_init(&dfops, &fsb);
+		agbno = rr->rr_rrec.rc_startblock - XFS_REFC_COW_START;
+		fsb = XFS_AGB_TO_FSB(mp, agno, agbno);
+		error = xfs_refcount_free_cow_extent(mp, &dfops, fsb,
+				rr->rr_rrec.rc_blockcount);
+		if (error)
+			goto out_defer;
+
+		/* Free the block. */
+		xfs_bmap_add_free(mp, &dfops, fsb,
+				rr->rr_rrec.rc_blockcount, NULL);
+
+		error = xfs_defer_finish(&tp, &dfops, NULL);
+		if (error)
+			goto out_defer;
+
+		error = xfs_trans_commit(tp);
+		if (error)
+			goto out_free;
+	}
+
+out_free:
+	/* Free the leftover list */
+	list_for_each_entry_safe(rr, n, &debris, rr_list) {
+		list_del(&rr->rr_list);
+		kmem_free(rr);
+	}
+	return error;
+
+out_cursor:
+	xfs_btree_del_cursor(cur, XFS_BTREE_ERROR);
+	xfs_buf_relse(agbp);
+	goto out_free;
+
+out_defer:
+	xfs_defer_cancel(&dfops);
+	xfs_trans_cancel(tp);
+	goto out_free;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount.h b/fs/xfs/libxfs/xfs_refcount.h
new file mode 100644
index 000000000000..098dc668ab2c
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount.h
@@ -0,0 +1,70 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_H__
+#define __XFS_REFCOUNT_H__
+
+extern int xfs_refcount_lookup_le(struct xfs_btree_cur *cur,
+		xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_lookup_ge(struct xfs_btree_cur *cur,
+		xfs_agblock_t bno, int *stat);
+extern int xfs_refcount_get_rec(struct xfs_btree_cur *cur,
+		struct xfs_refcount_irec *irec, int *stat);
+
+enum xfs_refcount_intent_type {
+	XFS_REFCOUNT_INCREASE = 1,
+	XFS_REFCOUNT_DECREASE,
+	XFS_REFCOUNT_ALLOC_COW,
+	XFS_REFCOUNT_FREE_COW,
+};
+
+struct xfs_refcount_intent {
+	struct list_head			ri_list;
+	enum xfs_refcount_intent_type		ri_type;
+	xfs_fsblock_t				ri_startblock;
+	xfs_extlen_t				ri_blockcount;
+};
+
+extern int xfs_refcount_increase_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+extern int xfs_refcount_decrease_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, struct xfs_bmbt_irec *irec);
+
+extern void xfs_refcount_finish_one_cleanup(struct xfs_trans *tp,
+		struct xfs_btree_cur *rcur, int error);
+extern int xfs_refcount_finish_one(struct xfs_trans *tp,
+		struct xfs_defer_ops *dfops, enum xfs_refcount_intent_type type,
+		xfs_fsblock_t startblock, xfs_extlen_t blockcount,
+		xfs_fsblock_t *new_fsb, xfs_extlen_t *new_len,
+		struct xfs_btree_cur **pcur);
+
+extern int xfs_refcount_find_shared(struct xfs_btree_cur *cur,
+		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+		xfs_extlen_t *flen, bool find_end_of_shared);
+
+extern int xfs_refcount_alloc_cow_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
+extern int xfs_refcount_free_cow_extent(struct xfs_mount *mp,
+		struct xfs_defer_ops *dfops, xfs_fsblock_t fsb,
+		xfs_extlen_t len);
+extern int xfs_refcount_recover_cow_leftovers(struct xfs_mount *mp,
+		xfs_agnumber_t agno);
+
+#endif	/* __XFS_REFCOUNT_H__ */
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.c b/fs/xfs/libxfs/xfs_refcount_btree.c
new file mode 100644
index 000000000000..453bb2757ec2
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.c
@@ -0,0 +1,451 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_sb.h"
+#include "xfs_mount.h"
+#include "xfs_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_alloc.h"
+#include "xfs_error.h"
+#include "xfs_trace.h"
+#include "xfs_cksum.h"
+#include "xfs_trans.h"
+#include "xfs_bit.h"
+#include "xfs_rmap.h"
+
+static struct xfs_btree_cur *
+xfs_refcountbt_dup_cursor(
+	struct xfs_btree_cur	*cur)
+{
+	return xfs_refcountbt_init_cursor(cur->bc_mp, cur->bc_tp,
+			cur->bc_private.a.agbp, cur->bc_private.a.agno,
+			cur->bc_private.a.dfops);
+}
+
+STATIC void
+xfs_refcountbt_set_root(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr,
+	int			inc)
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_agnumber_t		seqno = be32_to_cpu(agf->agf_seqno);
+	struct xfs_perag	*pag = xfs_perag_get(cur->bc_mp, seqno);
+
+	ASSERT(ptr->s != 0);
+
+	agf->agf_refcount_root = ptr->s;
+	be32_add_cpu(&agf->agf_refcount_level, inc);
+	pag->pagf_refcount_level += inc;
+	xfs_perag_put(pag);
+
+	xfs_alloc_log_agf(cur->bc_tp, agbp,
+			XFS_AGF_REFCOUNT_ROOT | XFS_AGF_REFCOUNT_LEVEL);
+}
+
+STATIC int
+xfs_refcountbt_alloc_block(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*start,
+	union xfs_btree_ptr	*new,
+	int			*stat)
+{
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_alloc_arg	args;		/* block allocation args */
+	int			error;		/* error return value */
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ENTRY);
+
+	memset(&args, 0, sizeof(args));
+	args.tp = cur->bc_tp;
+	args.mp = cur->bc_mp;
+	args.type = XFS_ALLOCTYPE_NEAR_BNO;
+	args.fsbno = XFS_AGB_TO_FSB(cur->bc_mp, cur->bc_private.a.agno,
+			xfs_refc_block(args.mp));
+	args.firstblock = args.fsbno;
+	xfs_rmap_ag_owner(&args.oinfo, XFS_RMAP_OWN_REFC);
+	args.minlen = args.maxlen = args.prod = 1;
+	args.resv = XFS_AG_RESV_METADATA;
+
+	error = xfs_alloc_vextent(&args);
+	if (error)
+		goto out_error;
+	trace_xfs_refcountbt_alloc_block(cur->bc_mp, cur->bc_private.a.agno,
+			args.agbno, 1);
+	if (args.fsbno == NULLFSBLOCK) {
+		XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+		*stat = 0;
+		return 0;
+	}
+	ASSERT(args.agno == cur->bc_private.a.agno);
+	ASSERT(args.len == 1);
+
+	new->s = cpu_to_be32(args.agbno);
+	be32_add_cpu(&agf->agf_refcount_blocks, 1);
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_EXIT);
+	*stat = 1;
+	return 0;
+
+out_error:
+	XFS_BTREE_TRACE_CURSOR(cur, XBT_ERROR);
+	return error;
+}
+
+STATIC int
+xfs_refcountbt_free_block(
+	struct xfs_btree_cur	*cur,
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_buf		*agbp = cur->bc_private.a.agbp;
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	xfs_fsblock_t		fsbno = XFS_DADDR_TO_FSB(mp, XFS_BUF_ADDR(bp));
+	struct xfs_owner_info	oinfo;
+	int			error;
+
+	trace_xfs_refcountbt_free_block(cur->bc_mp, cur->bc_private.a.agno,
+			XFS_FSB_TO_AGBNO(cur->bc_mp, fsbno), 1);
+	xfs_rmap_ag_owner(&oinfo, XFS_RMAP_OWN_REFC);
+	be32_add_cpu(&agf->agf_refcount_blocks, -1);
+	xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_REFCOUNT_BLOCKS);
+	error = xfs_free_extent(cur->bc_tp, fsbno, 1, &oinfo,
+			XFS_AG_RESV_METADATA);
+	if (error)
+		return error;
+
+	return error;
+}
+
+STATIC int
+xfs_refcountbt_get_minrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_refc_mnr[level != 0];
+}
+
+STATIC int
+xfs_refcountbt_get_maxrecs(
+	struct xfs_btree_cur	*cur,
+	int			level)
+{
+	return cur->bc_mp->m_refc_mxr[level != 0];
+}
+
+STATIC void
+xfs_refcountbt_init_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	key->refc.rc_startblock = rec->refc.rc_startblock;
+}
+
+STATIC void
+xfs_refcountbt_init_high_key_from_rec(
+	union xfs_btree_key	*key,
+	union xfs_btree_rec	*rec)
+{
+	__u32			x;
+
+	x = be32_to_cpu(rec->refc.rc_startblock);
+	x += be32_to_cpu(rec->refc.rc_blockcount) - 1;
+	key->refc.rc_startblock = cpu_to_be32(x);
+}
+
+STATIC void
+xfs_refcountbt_init_rec_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*rec)
+{
+	rec->refc.rc_startblock = cpu_to_be32(cur->bc_rec.rc.rc_startblock);
+	rec->refc.rc_blockcount = cpu_to_be32(cur->bc_rec.rc.rc_blockcount);
+	rec->refc.rc_refcount = cpu_to_be32(cur->bc_rec.rc.rc_refcount);
+}
+
+STATIC void
+xfs_refcountbt_init_ptr_from_cur(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_ptr	*ptr)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(cur->bc_private.a.agbp);
+
+	ASSERT(cur->bc_private.a.agno == be32_to_cpu(agf->agf_seqno));
+	ASSERT(agf->agf_refcount_root != 0);
+
+	ptr->s = agf->agf_refcount_root;
+}
+
+STATIC __int64_t
+xfs_refcountbt_key_diff(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*key)
+{
+	struct xfs_refcount_irec	*rec = &cur->bc_rec.rc;
+	struct xfs_refcount_key		*kp = &key->refc;
+
+	return (__int64_t)be32_to_cpu(kp->rc_startblock) - rec->rc_startblock;
+}
+
+STATIC __int64_t
+xfs_refcountbt_diff_two_keys(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return (__int64_t)be32_to_cpu(k1->refc.rc_startblock) -
+			  be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC bool
+xfs_refcountbt_verify(
+	struct xfs_buf		*bp)
+{
+	struct xfs_mount	*mp = bp->b_target->bt_mount;
+	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
+	struct xfs_perag	*pag = bp->b_pag;
+	unsigned int		level;
+
+	if (block->bb_magic != cpu_to_be32(XFS_REFC_CRC_MAGIC))
+		return false;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return false;
+	if (!xfs_btree_sblock_v5hdr_verify(bp))
+		return false;
+
+	level = be16_to_cpu(block->bb_level);
+	if (pag && pag->pagf_init) {
+		if (level >= pag->pagf_refcount_level)
+			return false;
+	} else if (level >= mp->m_refc_maxlevels)
+		return false;
+
+	return xfs_btree_sblock_verify(bp, mp->m_refc_mxr[level != 0]);
+}
+
+STATIC void
+xfs_refcountbt_read_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_btree_sblock_verify_crc(bp))
+		xfs_buf_ioerror(bp, -EFSBADCRC);
+	else if (!xfs_refcountbt_verify(bp))
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+
+	if (bp->b_error) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_verifier_error(bp);
+	}
+}
+
+STATIC void
+xfs_refcountbt_write_verify(
+	struct xfs_buf	*bp)
+{
+	if (!xfs_refcountbt_verify(bp)) {
+		trace_xfs_btree_corrupt(bp, _RET_IP_);
+		xfs_buf_ioerror(bp, -EFSCORRUPTED);
+		xfs_verifier_error(bp);
+		return;
+	}
+	xfs_btree_sblock_calc_crc(bp);
+
+}
+
+const struct xfs_buf_ops xfs_refcountbt_buf_ops = {
+	.name			= "xfs_refcountbt",
+	.verify_read		= xfs_refcountbt_read_verify,
+	.verify_write		= xfs_refcountbt_write_verify,
+};
+
+#if defined(DEBUG) || defined(XFS_WARN)
+STATIC int
+xfs_refcountbt_keys_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_key	*k1,
+	union xfs_btree_key	*k2)
+{
+	return be32_to_cpu(k1->refc.rc_startblock) <
+	       be32_to_cpu(k2->refc.rc_startblock);
+}
+
+STATIC int
+xfs_refcountbt_recs_inorder(
+	struct xfs_btree_cur	*cur,
+	union xfs_btree_rec	*r1,
+	union xfs_btree_rec	*r2)
+{
+	return  be32_to_cpu(r1->refc.rc_startblock) +
+		be32_to_cpu(r1->refc.rc_blockcount) <=
+		be32_to_cpu(r2->refc.rc_startblock);
+}
+#endif
+
+static const struct xfs_btree_ops xfs_refcountbt_ops = {
+	.rec_len		= sizeof(struct xfs_refcount_rec),
+	.key_len		= sizeof(struct xfs_refcount_key),
+
+	.dup_cursor		= xfs_refcountbt_dup_cursor,
+	.set_root		= xfs_refcountbt_set_root,
+	.alloc_block		= xfs_refcountbt_alloc_block,
+	.free_block		= xfs_refcountbt_free_block,
+	.get_minrecs		= xfs_refcountbt_get_minrecs,
+	.get_maxrecs		= xfs_refcountbt_get_maxrecs,
+	.init_key_from_rec	= xfs_refcountbt_init_key_from_rec,
+	.init_high_key_from_rec	= xfs_refcountbt_init_high_key_from_rec,
+	.init_rec_from_cur	= xfs_refcountbt_init_rec_from_cur,
+	.init_ptr_from_cur	= xfs_refcountbt_init_ptr_from_cur,
+	.key_diff		= xfs_refcountbt_key_diff,
+	.buf_ops		= &xfs_refcountbt_buf_ops,
+	.diff_two_keys		= xfs_refcountbt_diff_two_keys,
+#if defined(DEBUG) || defined(XFS_WARN)
+	.keys_inorder		= xfs_refcountbt_keys_inorder,
+	.recs_inorder		= xfs_refcountbt_recs_inorder,
+#endif
+};
+
+/*
+ * Allocate a new refcount btree cursor.
+ */
+struct xfs_btree_cur *
+xfs_refcountbt_init_cursor(
+	struct xfs_mount	*mp,
+	struct xfs_trans	*tp,
+	struct xfs_buf		*agbp,
+	xfs_agnumber_t		agno,
+	struct xfs_defer_ops	*dfops)
+{
+	struct xfs_agf		*agf = XFS_BUF_TO_AGF(agbp);
+	struct xfs_btree_cur	*cur;
+
+	ASSERT(agno != NULLAGNUMBER);
+	ASSERT(agno < mp->m_sb.sb_agcount);
+	cur = kmem_zone_zalloc(xfs_btree_cur_zone, KM_NOFS);
+
+	cur->bc_tp = tp;
+	cur->bc_mp = mp;
+	cur->bc_btnum = XFS_BTNUM_REFC;
+	cur->bc_blocklog = mp->m_sb.sb_blocklog;
+	cur->bc_ops = &xfs_refcountbt_ops;
+
+	cur->bc_nlevels = be32_to_cpu(agf->agf_refcount_level);
+
+	cur->bc_private.a.agbp = agbp;
+	cur->bc_private.a.agno = agno;
+	cur->bc_private.a.dfops = dfops;
+	cur->bc_flags |= XFS_BTREE_CRC_BLOCKS;
+
+	cur->bc_private.a.priv.refc.nr_ops = 0;
+	cur->bc_private.a.priv.refc.shape_changes = 0;
+
+	return cur;
+}
+
+/*
+ * Calculate the number of records in a refcount btree block.
+ */
+int
+xfs_refcountbt_maxrecs(
+	struct xfs_mount	*mp,
+	int			blocklen,
+	bool			leaf)
+{
+	blocklen -= XFS_REFCOUNT_BLOCK_LEN;
+
+	if (leaf)
+		return blocklen / sizeof(struct xfs_refcount_rec);
+	return blocklen / (sizeof(struct xfs_refcount_key) +
+			   sizeof(xfs_refcount_ptr_t));
+}
+
+/* Compute the maximum height of a refcount btree. */
+void
+xfs_refcountbt_compute_maxlevels(
+	struct xfs_mount		*mp)
+{
+	mp->m_refc_maxlevels = xfs_btree_compute_maxlevels(mp,
+			mp->m_refc_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_refcountbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp, mp->m_refc_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_refcountbt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_refc_mxr[0] == 0)
+		return 0;
+
+	return xfs_refcountbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_refcountbt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_agf		*agf;
+	xfs_extlen_t		tree_len;
+	int			error;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	*ask += xfs_refcountbt_max_size(mp);
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	tree_len = be32_to_cpu(agf->agf_refcount_blocks);
+	xfs_buf_relse(agbp);
+
+	*used += tree_len;
+
+	return error;
+}
diff --git a/fs/xfs/libxfs/xfs_refcount_btree.h b/fs/xfs/libxfs/xfs_refcount_btree.h
new file mode 100644
index 000000000000..3be7768bd51a
--- /dev/null
+++ b/fs/xfs/libxfs/xfs_refcount_btree.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_REFCOUNT_BTREE_H__
+#define	__XFS_REFCOUNT_BTREE_H__
+
+/*
+ * Reference Count Btree on-disk structures
+ */
+
+struct xfs_buf;
+struct xfs_btree_cur;
+struct xfs_mount;
+
+/*
+ * Btree block header size
+ */
+#define XFS_REFCOUNT_BLOCK_LEN	XFS_BTREE_SBLOCK_CRC_LEN
+
+/*
+ * Record, key, and pointer address macros for btree blocks.
+ *
+ * (note that some of these may appear unused, but they are used in userspace)
+ */
+#define XFS_REFCOUNT_REC_ADDR(block, index) \
+	((struct xfs_refcount_rec *) \
+		((char *)(block) + \
+		 XFS_REFCOUNT_BLOCK_LEN + \
+		 (((index) - 1) * sizeof(struct xfs_refcount_rec))))
+
+#define XFS_REFCOUNT_KEY_ADDR(block, index) \
+	((struct xfs_refcount_key *) \
+		((char *)(block) + \
+		 XFS_REFCOUNT_BLOCK_LEN + \
+		 ((index) - 1) * sizeof(struct xfs_refcount_key)))
+
+#define XFS_REFCOUNT_PTR_ADDR(block, index, maxrecs) \
+	((xfs_refcount_ptr_t *) \
+		((char *)(block) + \
+		 XFS_REFCOUNT_BLOCK_LEN + \
+		 (maxrecs) * sizeof(struct xfs_refcount_key) + \
+		 ((index) - 1) * sizeof(xfs_refcount_ptr_t)))
+
+extern struct xfs_btree_cur *xfs_refcountbt_init_cursor(struct xfs_mount *mp,
+		struct xfs_trans *tp, struct xfs_buf *agbp, xfs_agnumber_t agno,
+		struct xfs_defer_ops *dfops);
+extern int xfs_refcountbt_maxrecs(struct xfs_mount *mp, int blocklen,
+		bool leaf);
+extern void xfs_refcountbt_compute_maxlevels(struct xfs_mount *mp);
+
+extern xfs_extlen_t xfs_refcountbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+extern xfs_extlen_t xfs_refcountbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_refcountbt_calc_reserves(struct xfs_mount *mp,
+		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
+#endif	/* __XFS_REFCOUNT_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap.c b/fs/xfs/libxfs/xfs_rmap.c
index 73d05407d663..3a8cc7139912 100644
--- a/fs/xfs/libxfs/xfs_rmap.c
+++ b/fs/xfs/libxfs/xfs_rmap.c
@@ -148,6 +148,37 @@ done:
 	return error;
 }
 
+STATIC int
+xfs_rmap_delete(
+	struct xfs_btree_cur	*rcur,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		len,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags)
+{
+	int			i;
+	int			error;
+
+	trace_xfs_rmap_delete(rcur->bc_mp, rcur->bc_private.a.agno, agbno,
+			len, owner, offset, flags);
+
+	error = xfs_rmap_lookup_eq(rcur, agbno, len, owner, offset, flags, &i);
+	if (error)
+		goto done;
+	XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+
+	error = xfs_btree_delete(rcur, &i);
+	if (error)
+		goto done;
+	XFS_WANT_CORRUPTED_GOTO(rcur->bc_mp, i == 1, done);
+done:
+	if (error)
+		trace_xfs_rmap_delete_error(rcur->bc_mp,
+				rcur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
 static int
 xfs_rmap_btrec_to_irec(
 	union xfs_btree_rec	*rec,
@@ -180,6 +211,160 @@ xfs_rmap_get_rec(
 	return xfs_rmap_btrec_to_irec(rec, irec);
 }
 
+struct xfs_find_left_neighbor_info {
+	struct xfs_rmap_irec	high;
+	struct xfs_rmap_irec	*irec;
+	int			*stat;
+};
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_find_left_neighbor_helper(
+	struct xfs_btree_cur	*cur,
+	struct xfs_rmap_irec	*rec,
+	void			*priv)
+{
+	struct xfs_find_left_neighbor_info	*info = priv;
+
+	trace_xfs_rmap_find_left_neighbor_candidate(cur->bc_mp,
+			cur->bc_private.a.agno, rec->rm_startblock,
+			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+			rec->rm_flags);
+
+	if (rec->rm_owner != info->high.rm_owner)
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+	    rec->rm_offset + rec->rm_blockcount - 1 != info->high.rm_offset)
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+	*info->irec = *rec;
+	*info->stat = 1;
+	return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and adjacent physical and logical
+ * block ranges.
+ */
+int
+xfs_rmap_find_left_neighbor(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags,
+	struct xfs_rmap_irec	*irec,
+	int			*stat)
+{
+	struct xfs_find_left_neighbor_info	info;
+	int			error;
+
+	*stat = 0;
+	if (bno == 0)
+		return 0;
+	info.high.rm_startblock = bno - 1;
+	info.high.rm_owner = owner;
+	if (!XFS_RMAP_NON_INODE_OWNER(owner) &&
+	    !(flags & XFS_RMAP_BMBT_BLOCK)) {
+		if (offset == 0)
+			return 0;
+		info.high.rm_offset = offset - 1;
+	} else
+		info.high.rm_offset = 0;
+	info.high.rm_flags = flags;
+	info.high.rm_blockcount = 0;
+	info.irec = irec;
+	info.stat = stat;
+
+	trace_xfs_rmap_find_left_neighbor_query(cur->bc_mp,
+			cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+
+	error = xfs_rmap_query_range(cur, &info.high, &info.high,
+			xfs_rmap_find_left_neighbor_helper, &info);
+	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+		error = 0;
+	if (*stat)
+		trace_xfs_rmap_find_left_neighbor_result(cur->bc_mp,
+				cur->bc_private.a.agno, irec->rm_startblock,
+				irec->rm_blockcount, irec->rm_owner,
+				irec->rm_offset, irec->rm_flags);
+	return error;
+}
+
+/* For each rmap given, figure out if it matches the key we want. */
+STATIC int
+xfs_rmap_lookup_le_range_helper(
+	struct xfs_btree_cur	*cur,
+	struct xfs_rmap_irec	*rec,
+	void			*priv)
+{
+	struct xfs_find_left_neighbor_info	*info = priv;
+
+	trace_xfs_rmap_lookup_le_range_candidate(cur->bc_mp,
+			cur->bc_private.a.agno, rec->rm_startblock,
+			rec->rm_blockcount, rec->rm_owner, rec->rm_offset,
+			rec->rm_flags);
+
+	if (rec->rm_owner != info->high.rm_owner)
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+	if (!XFS_RMAP_NON_INODE_OWNER(rec->rm_owner) &&
+	    !(rec->rm_flags & XFS_RMAP_BMBT_BLOCK) &&
+	    (rec->rm_offset > info->high.rm_offset ||
+	     rec->rm_offset + rec->rm_blockcount <= info->high.rm_offset))
+		return XFS_BTREE_QUERY_RANGE_CONTINUE;
+
+	*info->irec = *rec;
+	*info->stat = 1;
+	return XFS_BTREE_QUERY_RANGE_ABORT;
+}
+
+/*
+ * Find the record to the left of the given extent, being careful only to
+ * return a match with the same owner and overlapping physical and logical
+ * block ranges.  This is the overlapping-interval version of
+ * xfs_rmap_lookup_le.
+ */
+int
+xfs_rmap_lookup_le_range(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	uint64_t		owner,
+	uint64_t		offset,
+	unsigned int		flags,
+	struct xfs_rmap_irec	*irec,
+	int			*stat)
+{
+	struct xfs_find_left_neighbor_info	info;
+	int			error;
+
+	info.high.rm_startblock = bno;
+	info.high.rm_owner = owner;
+	if (!XFS_RMAP_NON_INODE_OWNER(owner) && !(flags & XFS_RMAP_BMBT_BLOCK))
+		info.high.rm_offset = offset;
+	else
+		info.high.rm_offset = 0;
+	info.high.rm_flags = flags;
+	info.high.rm_blockcount = 0;
+	*stat = 0;
+	info.irec = irec;
+	info.stat = stat;
+
+	trace_xfs_rmap_lookup_le_range(cur->bc_mp,
+			cur->bc_private.a.agno, bno, 0, owner, offset, flags);
+	error = xfs_rmap_query_range(cur, &info.high, &info.high,
+			xfs_rmap_lookup_le_range_helper, &info);
+	if (error == XFS_BTREE_QUERY_RANGE_ABORT)
+		error = 0;
+	if (*stat)
+		trace_xfs_rmap_lookup_le_range_result(cur->bc_mp,
+				cur->bc_private.a.agno, irec->rm_startblock,
+				irec->rm_blockcount, irec->rm_owner,
+				irec->rm_offset, irec->rm_flags);
+	return error;
+}
+
 /*
  * Find the extent in the rmap btree and remove it.
  *
@@ -1093,11 +1278,704 @@ done:
 	return error;
 }
 
+/*
+ * Convert an unwritten extent to a real extent or vice versa.  If there is no
+ * possibility of overlapping extents, delegate to the simpler convert
+ * function.
+ */
+STATIC int
+xfs_rmap_convert_shared(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	bool			unwritten,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_rmap_irec	r[4];	/* neighbor extent entries */
+					/* left is 0, right is 1, prev is 2 */
+					/* new is 3 */
+	uint64_t		owner;
+	uint64_t		offset;
+	uint64_t		new_endoff;
+	unsigned int		oldext;
+	unsigned int		newext;
+	unsigned int		flags = 0;
+	int			i;
+	int			state = 0;
+	int			error;
+
+	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+	ASSERT(!(XFS_RMAP_NON_INODE_OWNER(owner) ||
+			(flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))));
+	oldext = unwritten ? XFS_RMAP_UNWRITTEN : 0;
+	new_endoff = offset + len;
+	trace_xfs_rmap_convert(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+
+	/*
+	 * For the initial lookup, look for and exact match or the left-adjacent
+	 * record for our insertion point. This will also give us the record for
+	 * start block contiguity tests.
+	 */
+	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+			&PREV, &i);
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+
+	ASSERT(PREV.rm_offset <= offset);
+	ASSERT(PREV.rm_offset + PREV.rm_blockcount >= new_endoff);
+	ASSERT((PREV.rm_flags & XFS_RMAP_UNWRITTEN) == oldext);
+	newext = ~oldext & XFS_RMAP_UNWRITTEN;
+
+	/*
+	 * Set flags determining what part of the previous oldext allocation
+	 * extent is being replaced by a newext allocation.
+	 */
+	if (PREV.rm_offset == offset)
+		state |= RMAP_LEFT_FILLING;
+	if (PREV.rm_offset + PREV.rm_blockcount == new_endoff)
+		state |= RMAP_RIGHT_FILLING;
+
+	/* Is there a left record that abuts our range? */
+	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, newext,
+			&LEFT, &i);
+	if (error)
+		goto done;
+	if (i) {
+		state |= RMAP_LEFT_VALID;
+		XFS_WANT_CORRUPTED_GOTO(mp,
+				LEFT.rm_startblock + LEFT.rm_blockcount <= bno,
+				done);
+		if (xfs_rmap_is_mergeable(&LEFT, owner, newext))
+			state |= RMAP_LEFT_CONTIG;
+	}
+
+	/* Is there a right record that abuts our range? */
+	error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+			newext, &i);
+	if (error)
+		goto done;
+	if (i) {
+		state |= RMAP_RIGHT_VALID;
+		error = xfs_rmap_get_rec(cur, &RIGHT, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		XFS_WANT_CORRUPTED_GOTO(mp, bno + len <= RIGHT.rm_startblock,
+				done);
+		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+				cur->bc_private.a.agno, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
+		if (xfs_rmap_is_mergeable(&RIGHT, owner, newext))
+			state |= RMAP_RIGHT_CONTIG;
+	}
+
+	/* check that left + prev + right is not too long */
+	if ((state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+			 RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) ==
+	    (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+	     RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG) &&
+	    (unsigned long)LEFT.rm_blockcount + len +
+	     RIGHT.rm_blockcount > XFS_RMAP_LEN_MAX)
+		state &= ~RMAP_RIGHT_CONTIG;
+
+	trace_xfs_rmap_convert_state(mp, cur->bc_private.a.agno, state,
+			_RET_IP_);
+	/*
+	 * Switch out based on the FILLING and CONTIG state bits.
+	 */
+	switch (state & (RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+			 RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG)) {
+	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG |
+	     RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left and right neighbors are both contiguous with new.
+		 */
+		error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
+		if (error)
+			goto done;
+		error = xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
+		if (error)
+			goto done;
+		NEW = LEFT;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += PREV.rm_blockcount + RIGHT.rm_blockcount;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The left neighbor is contiguous, the right is not.
+		 */
+		error = xfs_rmap_delete(cur, PREV.rm_startblock,
+				PREV.rm_blockcount, PREV.rm_owner,
+				PREV.rm_offset, PREV.rm_flags);
+		if (error)
+			goto done;
+		NEW = LEFT;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += PREV.rm_blockcount;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * The right neighbor is contiguous, the left is not.
+		 */
+		error = xfs_rmap_delete(cur, RIGHT.rm_startblock,
+				RIGHT.rm_blockcount, RIGHT.rm_owner,
+				RIGHT.rm_offset, RIGHT.rm_flags);
+		if (error)
+			goto done;
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += RIGHT.rm_blockcount;
+		NEW.rm_flags = RIGHT.rm_flags;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_FILLING:
+		/*
+		 * Setting all of a previous oldext extent to newext.
+		 * Neither the left nor right neighbors are contiguous with
+		 * the new one.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_flags = newext;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is contiguous.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_delete(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW.rm_offset += len;
+		NEW.rm_startblock += len;
+		NEW.rm_blockcount -= len;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW = LEFT;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount += len;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING:
+		/*
+		 * Setting the first part of a previous oldext extent to newext.
+		 * The left neighbor is not contiguous.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_delete(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW.rm_offset += len;
+		NEW.rm_startblock += len;
+		NEW.rm_blockcount -= len;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_RIGHT_FILLING | RMAP_RIGHT_CONTIG:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is contiguous with the new allocation.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount = offset - NEW.rm_offset;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		NEW = RIGHT;
+		error = xfs_rmap_delete(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		NEW.rm_offset = offset;
+		NEW.rm_startblock = bno;
+		NEW.rm_blockcount += len;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_RIGHT_FILLING:
+		/*
+		 * Setting the last part of a previous oldext extent to newext.
+		 * The right neighbor is not contiguous.
+		 */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount -= len;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		error = xfs_rmap_insert(cur, bno, len, owner, offset, newext);
+		if (error)
+			goto done;
+		break;
+
+	case 0:
+		/*
+		 * Setting the middle part of a previous oldext extent to
+		 * newext.  Contiguity is impossible here.
+		 * One extent becomes three extents.
+		 */
+		/* new right extent - oldext */
+		NEW.rm_startblock = bno + len;
+		NEW.rm_owner = owner;
+		NEW.rm_offset = new_endoff;
+		NEW.rm_blockcount = PREV.rm_offset + PREV.rm_blockcount -
+				new_endoff;
+		NEW.rm_flags = PREV.rm_flags;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+				NEW.rm_flags);
+		if (error)
+			goto done;
+		/* new left extent - oldext */
+		NEW = PREV;
+		error = xfs_rmap_lookup_eq(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner,
+				NEW.rm_offset, NEW.rm_flags, &i);
+		if (error)
+			goto done;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, done);
+		NEW.rm_blockcount = offset - NEW.rm_offset;
+		error = xfs_rmap_update(cur, &NEW);
+		if (error)
+			goto done;
+		/* new middle extent - newext */
+		NEW.rm_startblock = bno;
+		NEW.rm_blockcount = len;
+		NEW.rm_owner = owner;
+		NEW.rm_offset = offset;
+		NEW.rm_flags = newext;
+		error = xfs_rmap_insert(cur, NEW.rm_startblock,
+				NEW.rm_blockcount, NEW.rm_owner, NEW.rm_offset,
+				NEW.rm_flags);
+		if (error)
+			goto done;
+		break;
+
+	case RMAP_LEFT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+	case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+	case RMAP_LEFT_FILLING | RMAP_RIGHT_CONTIG:
+	case RMAP_RIGHT_FILLING | RMAP_LEFT_CONTIG:
+	case RMAP_LEFT_CONTIG | RMAP_RIGHT_CONTIG:
+	case RMAP_LEFT_CONTIG:
+	case RMAP_RIGHT_CONTIG:
+		/*
+		 * These cases are all impossible.
+		 */
+		ASSERT(0);
+	}
+
+	trace_xfs_rmap_convert_done(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+done:
+	if (error)
+		trace_xfs_rmap_convert_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
 #undef	NEW
 #undef	LEFT
 #undef	RIGHT
 #undef	PREV
 
+/*
+ * Find an extent in the rmap btree and unmap it.  For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _free function.
+ */
+STATIC int
+xfs_rmap_unmap_shared(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	bool			unwritten,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_rmap_irec	ltrec;
+	uint64_t		ltoff;
+	int			error = 0;
+	int			i;
+	uint64_t		owner;
+	uint64_t		offset;
+	unsigned int		flags;
+
+	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+	if (unwritten)
+		flags |= XFS_RMAP_UNWRITTEN;
+	trace_xfs_rmap_unmap(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+
+	/*
+	 * We should always have a left record because there's a static record
+	 * for the AG headers at rm_startblock == 0 created by mkfs/growfs that
+	 * will not ever be removed from the tree.
+	 */
+	error = xfs_rmap_lookup_le_range(cur, bno, owner, offset, flags,
+			&ltrec, &i);
+	if (error)
+		goto out_error;
+	XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+	ltoff = ltrec.rm_offset;
+
+	/* Make sure the extent we found covers the entire freeing range. */
+	XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_startblock <= bno &&
+		ltrec.rm_startblock + ltrec.rm_blockcount >=
+		bno + len, out_error);
+
+	/* Make sure the owner matches what we expect to find in the tree. */
+	XFS_WANT_CORRUPTED_GOTO(mp, owner == ltrec.rm_owner, out_error);
+
+	/* Make sure the unwritten flag matches. */
+	XFS_WANT_CORRUPTED_GOTO(mp, (flags & XFS_RMAP_UNWRITTEN) ==
+			(ltrec.rm_flags & XFS_RMAP_UNWRITTEN), out_error);
+
+	/* Check the offset. */
+	XFS_WANT_CORRUPTED_GOTO(mp, ltrec.rm_offset <= offset, out_error);
+	XFS_WANT_CORRUPTED_GOTO(mp, offset <= ltoff + ltrec.rm_blockcount,
+			out_error);
+
+	if (ltrec.rm_startblock == bno && ltrec.rm_blockcount == len) {
+		/* Exact match, simply remove the record from rmap tree. */
+		error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
+		if (error)
+			goto out_error;
+	} else if (ltrec.rm_startblock == bno) {
+		/*
+		 * Overlap left hand side of extent: move the start, trim the
+		 * length and update the current record.
+		 *
+		 *       ltbno                ltlen
+		 * Orig:    |oooooooooooooooooooo|
+		 * Freeing: |fffffffff|
+		 * Result:            |rrrrrrrrrr|
+		 *         bno       len
+		 */
+
+		/* Delete prev rmap. */
+		error = xfs_rmap_delete(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
+		if (error)
+			goto out_error;
+
+		/* Add an rmap at the new offset. */
+		ltrec.rm_startblock += len;
+		ltrec.rm_blockcount -= len;
+		ltrec.rm_offset += len;
+		error = xfs_rmap_insert(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags);
+		if (error)
+			goto out_error;
+	} else if (ltrec.rm_startblock + ltrec.rm_blockcount == bno + len) {
+		/*
+		 * Overlap right hand side of extent: trim the length and
+		 * update the current record.
+		 *
+		 *       ltbno                ltlen
+		 * Orig:    |oooooooooooooooooooo|
+		 * Freeing:            |fffffffff|
+		 * Result:  |rrrrrrrrrr|
+		 *                    bno       len
+		 */
+		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+		ltrec.rm_blockcount -= len;
+		error = xfs_rmap_update(cur, &ltrec);
+		if (error)
+			goto out_error;
+	} else {
+		/*
+		 * Overlap middle of extent: trim the length of the existing
+		 * record to the length of the new left-extent size, increment
+		 * the insertion position so we can insert a new record
+		 * containing the remaining right-extent space.
+		 *
+		 *       ltbno                ltlen
+		 * Orig:    |oooooooooooooooooooo|
+		 * Freeing:       |fffffffff|
+		 * Result:  |rrrrr|         |rrrr|
+		 *               bno       len
+		 */
+		xfs_extlen_t	orig_len = ltrec.rm_blockcount;
+
+		/* Shrink the left side of the rmap */
+		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+		ltrec.rm_blockcount = bno - ltrec.rm_startblock;
+		error = xfs_rmap_update(cur, &ltrec);
+		if (error)
+			goto out_error;
+
+		/* Add an rmap at the new offset */
+		error = xfs_rmap_insert(cur, bno + len,
+				orig_len - len - ltrec.rm_blockcount,
+				ltrec.rm_owner, offset + len,
+				ltrec.rm_flags);
+		if (error)
+			goto out_error;
+	}
+
+	trace_xfs_rmap_unmap_done(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+out_error:
+	if (error)
+		trace_xfs_rmap_unmap_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Find an extent in the rmap btree and map it.  For rmap extent types that
+ * can overlap (data fork rmaps on reflink filesystems) we must be careful
+ * that the prev/next records in the btree might belong to another owner.
+ * Therefore we must use delete+insert to alter any of the key fields.
+ *
+ * For every other situation there can only be one owner for a given extent,
+ * so we can call the regular _alloc function.
+ */
+STATIC int
+xfs_rmap_map_shared(
+	struct xfs_btree_cur	*cur,
+	xfs_agblock_t		bno,
+	xfs_extlen_t		len,
+	bool			unwritten,
+	struct xfs_owner_info	*oinfo)
+{
+	struct xfs_mount	*mp = cur->bc_mp;
+	struct xfs_rmap_irec	ltrec;
+	struct xfs_rmap_irec	gtrec;
+	int			have_gt;
+	int			have_lt;
+	int			error = 0;
+	int			i;
+	uint64_t		owner;
+	uint64_t		offset;
+	unsigned int		flags = 0;
+
+	xfs_owner_info_unpack(oinfo, &owner, &offset, &flags);
+	if (unwritten)
+		flags |= XFS_RMAP_UNWRITTEN;
+	trace_xfs_rmap_map(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+
+	/* Is there a left record that abuts our range? */
+	error = xfs_rmap_find_left_neighbor(cur, bno, owner, offset, flags,
+			&ltrec, &have_lt);
+	if (error)
+		goto out_error;
+	if (have_lt &&
+	    !xfs_rmap_is_mergeable(&ltrec, owner, flags))
+		have_lt = 0;
+
+	/* Is there a right record that abuts our range? */
+	error = xfs_rmap_lookup_eq(cur, bno + len, len, owner, offset + len,
+			flags, &have_gt);
+	if (error)
+		goto out_error;
+	if (have_gt) {
+		error = xfs_rmap_get_rec(cur, &gtrec, &have_gt);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, have_gt == 1, out_error);
+		trace_xfs_rmap_find_right_neighbor_result(cur->bc_mp,
+			cur->bc_private.a.agno, gtrec.rm_startblock,
+			gtrec.rm_blockcount, gtrec.rm_owner,
+			gtrec.rm_offset, gtrec.rm_flags);
+
+		if (!xfs_rmap_is_mergeable(&gtrec, owner, flags))
+			have_gt = 0;
+	}
+
+	if (have_lt &&
+	    ltrec.rm_startblock + ltrec.rm_blockcount == bno &&
+	    ltrec.rm_offset + ltrec.rm_blockcount == offset) {
+		/*
+		 * Left edge contiguous, merge into left record.
+		 *
+		 *       ltbno     ltlen
+		 * orig:   |ooooooooo|
+		 * adding:           |aaaaaaaaa|
+		 * result: |rrrrrrrrrrrrrrrrrrr|
+		 *                  bno       len
+		 */
+		ltrec.rm_blockcount += len;
+		if (have_gt &&
+		    bno + len == gtrec.rm_startblock &&
+		    offset + len == gtrec.rm_offset) {
+			/*
+			 * Right edge also contiguous, delete right record
+			 * and merge into left record.
+			 *
+			 *       ltbno     ltlen    gtbno     gtlen
+			 * orig:   |ooooooooo|         |ooooooooo|
+			 * adding:           |aaaaaaaaa|
+			 * result: |rrrrrrrrrrrrrrrrrrrrrrrrrrrrr|
+			 */
+			ltrec.rm_blockcount += gtrec.rm_blockcount;
+			error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+					gtrec.rm_blockcount, gtrec.rm_owner,
+					gtrec.rm_offset, gtrec.rm_flags);
+			if (error)
+				goto out_error;
+		}
+
+		/* Point the cursor back to the left record and update. */
+		error = xfs_rmap_lookup_eq(cur, ltrec.rm_startblock,
+				ltrec.rm_blockcount, ltrec.rm_owner,
+				ltrec.rm_offset, ltrec.rm_flags, &i);
+		if (error)
+			goto out_error;
+		XFS_WANT_CORRUPTED_GOTO(mp, i == 1, out_error);
+
+		error = xfs_rmap_update(cur, &ltrec);
+		if (error)
+			goto out_error;
+	} else if (have_gt &&
+		   bno + len == gtrec.rm_startblock &&
+		   offset + len == gtrec.rm_offset) {
+		/*
+		 * Right edge contiguous, merge into right record.
+		 *
+		 *                 gtbno     gtlen
+		 * Orig:             |ooooooooo|
+		 * adding: |aaaaaaaaa|
+		 * Result: |rrrrrrrrrrrrrrrrrrr|
+		 *        bno       len
+		 */
+		/* Delete the old record. */
+		error = xfs_rmap_delete(cur, gtrec.rm_startblock,
+				gtrec.rm_blockcount, gtrec.rm_owner,
+				gtrec.rm_offset, gtrec.rm_flags);
+		if (error)
+			goto out_error;
+
+		/* Move the start and re-add it. */
+		gtrec.rm_startblock = bno;
+		gtrec.rm_blockcount += len;
+		gtrec.rm_offset = offset;
+		error = xfs_rmap_insert(cur, gtrec.rm_startblock,
+				gtrec.rm_blockcount, gtrec.rm_owner,
+				gtrec.rm_offset, gtrec.rm_flags);
+		if (error)
+			goto out_error;
+	} else {
+		/*
+		 * No contiguous edge with identical owner, insert
+		 * new record at current cursor position.
+		 */
+		error = xfs_rmap_insert(cur, bno, len, owner, offset, flags);
+		if (error)
+			goto out_error;
+	}
+
+	trace_xfs_rmap_map_done(mp, cur->bc_private.a.agno, bno, len,
+			unwritten, oinfo);
+out_error:
+	if (error)
+		trace_xfs_rmap_map_error(cur->bc_mp,
+				cur->bc_private.a.agno, error, _RET_IP_);
+	return error;
+}
+
 struct xfs_rmap_query_range_info {
 	xfs_rmap_query_range_fn	fn;
 	void				*priv;
@@ -1237,15 +2115,27 @@ xfs_rmap_finish_one(
 	case XFS_RMAP_MAP:
 		error = xfs_rmap_map(rcur, bno, blockcount, unwritten, &oinfo);
 		break;
+	case XFS_RMAP_MAP_SHARED:
+		error = xfs_rmap_map_shared(rcur, bno, blockcount, unwritten,
+				&oinfo);
+		break;
 	case XFS_RMAP_FREE:
 	case XFS_RMAP_UNMAP:
 		error = xfs_rmap_unmap(rcur, bno, blockcount, unwritten,
 				&oinfo);
 		break;
+	case XFS_RMAP_UNMAP_SHARED:
+		error = xfs_rmap_unmap_shared(rcur, bno, blockcount, unwritten,
+				&oinfo);
+		break;
 	case XFS_RMAP_CONVERT:
 		error = xfs_rmap_convert(rcur, bno, blockcount, !unwritten,
 				&oinfo);
 		break;
+	case XFS_RMAP_CONVERT_SHARED:
+		error = xfs_rmap_convert_shared(rcur, bno, blockcount,
+				!unwritten, &oinfo);
+		break;
 	default:
 		ASSERT(0);
 		error = -EFSCORRUPTED;
@@ -1263,9 +2153,10 @@ out_cur:
  */
 static bool
 xfs_rmap_update_is_needed(
-	struct xfs_mount	*mp)
+	struct xfs_mount	*mp,
+	int			whichfork)
 {
-	return xfs_sb_version_hasrmapbt(&mp->m_sb);
+	return xfs_sb_version_hasrmapbt(&mp->m_sb) && whichfork != XFS_COW_FORK;
 }
 
 /*
@@ -1311,10 +2202,11 @@ xfs_rmap_map_extent(
 	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return 0;
 
-	return __xfs_rmap_add(mp, dfops, XFS_RMAP_MAP, ip->i_ino,
+	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+			XFS_RMAP_MAP_SHARED : XFS_RMAP_MAP, ip->i_ino,
 			whichfork, PREV);
 }
 
@@ -1327,10 +2219,11 @@ xfs_rmap_unmap_extent(
 	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return 0;
 
-	return __xfs_rmap_add(mp, dfops, XFS_RMAP_UNMAP, ip->i_ino,
+	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+			XFS_RMAP_UNMAP_SHARED : XFS_RMAP_UNMAP, ip->i_ino,
 			whichfork, PREV);
 }
 
@@ -1343,10 +2236,11 @@ xfs_rmap_convert_extent(
 	int			whichfork,
 	struct xfs_bmbt_irec	*PREV)
 {
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, whichfork))
 		return 0;
 
-	return __xfs_rmap_add(mp, dfops, XFS_RMAP_CONVERT, ip->i_ino,
+	return __xfs_rmap_add(mp, dfops, xfs_is_reflink_inode(ip) ?
+			XFS_RMAP_CONVERT_SHARED : XFS_RMAP_CONVERT, ip->i_ino,
 			whichfork, PREV);
 }
 
@@ -1362,7 +2256,7 @@ xfs_rmap_alloc_extent(
 {
 	struct xfs_bmbt_irec	bmap;
 
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
 		return 0;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
@@ -1386,7 +2280,7 @@ xfs_rmap_free_extent(
 {
 	struct xfs_bmbt_irec	bmap;
 
-	if (!xfs_rmap_update_is_needed(mp))
+	if (!xfs_rmap_update_is_needed(mp, XFS_DATA_FORK))
 		return 0;
 
 	bmap.br_startblock = XFS_AGB_TO_FSB(mp, agno, bno);
diff --git a/fs/xfs/libxfs/xfs_rmap.h b/fs/xfs/libxfs/xfs_rmap.h
index 71cf99a4acba..789930599339 100644
--- a/fs/xfs/libxfs/xfs_rmap.h
+++ b/fs/xfs/libxfs/xfs_rmap.h
@@ -206,4 +206,11 @@ int xfs_rmap_finish_one(struct xfs_trans *tp, enum xfs_rmap_intent_type type,
 		xfs_fsblock_t startblock, xfs_filblks_t blockcount,
 		xfs_exntst_t state, struct xfs_btree_cur **pcur);
 
+int xfs_rmap_find_left_neighbor(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		uint64_t owner, uint64_t offset, unsigned int flags,
+		struct xfs_rmap_irec *irec, int	*stat);
+int xfs_rmap_lookup_le_range(struct xfs_btree_cur *cur, xfs_agblock_t bno,
+		uint64_t owner, uint64_t offset, unsigned int flags,
+		struct xfs_rmap_irec *irec, int	*stat);
+
 #endif	/* __XFS_RMAP_H__ */
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.c b/fs/xfs/libxfs/xfs_rmap_btree.c
index 17b8eeb34ac8..83e672ff7577 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.c
+++ b/fs/xfs/libxfs/xfs_rmap_btree.c
@@ -35,6 +35,7 @@
 #include "xfs_cksum.h"
 #include "xfs_error.h"
 #include "xfs_extent_busy.h"
+#include "xfs_ag_resv.h"
 
 /*
  * Reverse map btree.
@@ -512,6 +513,83 @@ void
 xfs_rmapbt_compute_maxlevels(
 	struct xfs_mount		*mp)
 {
-	mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
-			mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+	/*
+	 * On a non-reflink filesystem, the maximum number of rmap
+	 * records is the number of blocks in the AG, hence the max
+	 * rmapbt height is log_$maxrecs($agblocks).  However, with
+	 * reflink each AG block can have up to 2^32 (per the refcount
+	 * record format) owners, which means that theoretically we
+	 * could face up to 2^64 rmap records.
+	 *
+	 * That effectively means that the max rmapbt height must be
+	 * XFS_BTREE_MAXLEVELS.  "Fortunately" we'll run out of AG
+	 * blocks to feed the rmapbt long before the rmapbt reaches
+	 * maximum height.  The reflink code uses ag_resv_critical to
+	 * disallow reflinking when less than 10% of the per-AG metadata
+	 * block reservation since the fallback is a regular file copy.
+	 */
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		mp->m_rmap_maxlevels = XFS_BTREE_MAXLEVELS;
+	else
+		mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(mp,
+				mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
+}
+
+/* Calculate the refcount btree size for some records. */
+xfs_extlen_t
+xfs_rmapbt_calc_size(
+	struct xfs_mount	*mp,
+	unsigned long long	len)
+{
+	return xfs_btree_calc_size(mp, mp->m_rmap_mnr, len);
+}
+
+/*
+ * Calculate the maximum refcount btree size.
+ */
+xfs_extlen_t
+xfs_rmapbt_max_size(
+	struct xfs_mount	*mp)
+{
+	/* Bail out if we're uninitialized, which can happen in mkfs. */
+	if (mp->m_rmap_mxr[0] == 0)
+		return 0;
+
+	return xfs_rmapbt_calc_size(mp, mp->m_sb.sb_agblocks);
+}
+
+/*
+ * Figure out how many blocks to reserve and how many are used by this btree.
+ */
+int
+xfs_rmapbt_calc_reserves(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_extlen_t		*ask,
+	xfs_extlen_t		*used)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_agf		*agf;
+	xfs_extlen_t		pool_len;
+	xfs_extlen_t		tree_len;
+	int			error;
+
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return 0;
+
+	/* Reserve 1% of the AG or enough for 1 block per record. */
+	pool_len = max(mp->m_sb.sb_agblocks / 100, xfs_rmapbt_max_size(mp));
+	*ask += pool_len;
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+
+	agf = XFS_BUF_TO_AGF(agbp);
+	tree_len = be32_to_cpu(agf->agf_rmap_blocks);
+	xfs_buf_relse(agbp);
+
+	*used += tree_len;
+
+	return error;
 }
diff --git a/fs/xfs/libxfs/xfs_rmap_btree.h b/fs/xfs/libxfs/xfs_rmap_btree.h
index e73a55357dab..2a9ac472fb15 100644
--- a/fs/xfs/libxfs/xfs_rmap_btree.h
+++ b/fs/xfs/libxfs/xfs_rmap_btree.h
@@ -58,4 +58,11 @@ struct xfs_btree_cur *xfs_rmapbt_init_cursor(struct xfs_mount *mp,
 int xfs_rmapbt_maxrecs(struct xfs_mount *mp, int blocklen, int leaf);
 extern void xfs_rmapbt_compute_maxlevels(struct xfs_mount *mp);
 
+extern xfs_extlen_t xfs_rmapbt_calc_size(struct xfs_mount *mp,
+		unsigned long long len);
+extern xfs_extlen_t xfs_rmapbt_max_size(struct xfs_mount *mp);
+
+extern int xfs_rmapbt_calc_reserves(struct xfs_mount *mp,
+		xfs_agnumber_t agno, xfs_extlen_t *ask, xfs_extlen_t *used);
+
 #endif	/* __XFS_RMAP_BTREE_H__ */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 4aecc5fefe96..a70aec910626 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -38,6 +38,8 @@
 #include "xfs_ialloc_btree.h"
 #include "xfs_log.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_bmap.h"
+#include "xfs_refcount_btree.h"
 
 /*
  * Physical superblock buffer manipulations. Shared with libxfs in userspace.
@@ -737,6 +739,13 @@ xfs_sb_mount_common(
 	mp->m_rmap_mnr[0] = mp->m_rmap_mxr[0] / 2;
 	mp->m_rmap_mnr[1] = mp->m_rmap_mxr[1] / 2;
 
+	mp->m_refc_mxr[0] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+			true);
+	mp->m_refc_mxr[1] = xfs_refcountbt_maxrecs(mp, sbp->sb_blocksize,
+			false);
+	mp->m_refc_mnr[0] = mp->m_refc_mxr[0] / 2;
+	mp->m_refc_mnr[1] = mp->m_refc_mxr[1] / 2;
+
 	mp->m_bsize = XFS_FSB_TO_BB(mp, 1);
 	mp->m_ialloc_inos = (int)MAX((__uint16_t)XFS_INODES_PER_CHUNK,
 					sbp->sb_inopblock);
diff --git a/fs/xfs/libxfs/xfs_shared.h b/fs/xfs/libxfs/xfs_shared.h
index 0c5b30bd884c..c6f4eb46fe26 100644
--- a/fs/xfs/libxfs/xfs_shared.h
+++ b/fs/xfs/libxfs/xfs_shared.h
@@ -39,6 +39,7 @@ extern const struct xfs_buf_ops xfs_agf_buf_ops;
 extern const struct xfs_buf_ops xfs_agfl_buf_ops;
 extern const struct xfs_buf_ops xfs_allocbt_buf_ops;
 extern const struct xfs_buf_ops xfs_rmapbt_buf_ops;
+extern const struct xfs_buf_ops xfs_refcountbt_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_leaf_buf_ops;
 extern const struct xfs_buf_ops xfs_attr3_rmt_buf_ops;
 extern const struct xfs_buf_ops xfs_bmbt_buf_ops;
@@ -122,6 +123,7 @@ int	xfs_log_calc_minimum_size(struct xfs_mount *);
 #define	XFS_INO_REF		2
 #define	XFS_ATTR_BTREE_REF	1
 #define	XFS_DQUOT_REF		1
+#define	XFS_REFC_BTREE_REF	1
 
 /*
  * Flags for xfs_trans_ichgtime().
diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c
index 301ef2f4dbd6..b456cca1bfb2 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.c
+++ b/fs/xfs/libxfs/xfs_trans_resv.c
@@ -67,13 +67,14 @@ xfs_calc_buf_res(
  * Per-extent log reservation for the btree changes involved in freeing or
  * allocating an extent.  In classic XFS there were two trees that will be
  * modified (bnobt + cntbt).  With rmap enabled, there are three trees
- * (rmapbt).  The number of blocks reserved is based on the formula:
+ * (rmapbt).  With reflink, there are four trees (refcountbt).  The number of
+ * blocks reserved is based on the formula:
  *
  * num trees * ((2 blocks/level * max depth) - 1)
  *
  * Keep in mind that max depth is calculated separately for each type of tree.
  */
-static uint
+uint
 xfs_allocfree_log_count(
 	struct xfs_mount *mp,
 	uint		num_ops)
@@ -83,6 +84,8 @@ xfs_allocfree_log_count(
 	blocks = num_ops * 2 * (2 * mp->m_ag_maxlevels - 1);
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
 		blocks += num_ops * (2 * mp->m_rmap_maxlevels - 1);
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		blocks += num_ops * (2 * mp->m_refc_maxlevels - 1);
 
 	return blocks;
 }
@@ -809,11 +812,18 @@ xfs_trans_resv_calc(
 	 * require a permanent reservation on space.
 	 */
 	resp->tr_write.tr_logres = xfs_calc_write_reservation(mp);
-	resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+	else
+		resp->tr_write.tr_logcount = XFS_WRITE_LOG_COUNT;
 	resp->tr_write.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	resp->tr_itruncate.tr_logres = xfs_calc_itruncate_reservation(mp);
-	resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		resp->tr_itruncate.tr_logcount =
+				XFS_ITRUNCATE_LOG_COUNT_REFLINK;
+	else
+		resp->tr_itruncate.tr_logcount = XFS_ITRUNCATE_LOG_COUNT;
 	resp->tr_itruncate.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	resp->tr_rename.tr_logres = xfs_calc_rename_reservation(mp);
@@ -870,7 +880,10 @@ xfs_trans_resv_calc(
 	resp->tr_growrtalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	resp->tr_qm_dqalloc.tr_logres = xfs_calc_qm_dqalloc_reservation(mp);
-	resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT_REFLINK;
+	else
+		resp->tr_qm_dqalloc.tr_logcount = XFS_WRITE_LOG_COUNT;
 	resp->tr_qm_dqalloc.tr_logflags |= XFS_TRANS_PERM_LOG_RES;
 
 	/*
diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h
index 0eb46ed6d404..b7e5357d060a 100644
--- a/fs/xfs/libxfs/xfs_trans_resv.h
+++ b/fs/xfs/libxfs/xfs_trans_resv.h
@@ -87,6 +87,7 @@ struct xfs_trans_resv {
 #define	XFS_DEFAULT_LOG_COUNT		1
 #define	XFS_DEFAULT_PERM_LOG_COUNT	2
 #define	XFS_ITRUNCATE_LOG_COUNT		2
+#define	XFS_ITRUNCATE_LOG_COUNT_REFLINK	8
 #define XFS_INACTIVE_LOG_COUNT		2
 #define	XFS_CREATE_LOG_COUNT		2
 #define	XFS_CREATE_TMPFILE_LOG_COUNT	2
@@ -96,11 +97,13 @@ struct xfs_trans_resv {
 #define	XFS_LINK_LOG_COUNT		2
 #define	XFS_RENAME_LOG_COUNT		2
 #define	XFS_WRITE_LOG_COUNT		2
+#define	XFS_WRITE_LOG_COUNT_REFLINK	8
 #define	XFS_ADDAFORK_LOG_COUNT		2
 #define	XFS_ATTRINVAL_LOG_COUNT		1
 #define	XFS_ATTRSET_LOG_COUNT		3
 #define	XFS_ATTRRM_LOG_COUNT		3
 
 void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp);
+uint xfs_allocfree_log_count(struct xfs_mount *mp, uint num_ops);
 
 #endif	/* __XFS_TRANS_RESV_H__ */
diff --git a/fs/xfs/libxfs/xfs_trans_space.h b/fs/xfs/libxfs/xfs_trans_space.h
index 41e0428d8175..7917f6e44286 100644
--- a/fs/xfs/libxfs/xfs_trans_space.h
+++ b/fs/xfs/libxfs/xfs_trans_space.h
@@ -21,6 +21,8 @@
 /*
  * Components of space reservations.
  */
+#define XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)    \
+		(((mp)->m_rmap_mxr[0]) - ((mp)->m_rmap_mnr[0]))
 #define XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)    \
 		(((mp)->m_alloc_mxr[0]) - ((mp)->m_alloc_mnr[0]))
 #define	XFS_EXTENTADD_SPACE_RES(mp,w)	(XFS_BM_MAXLEVELS(mp,w) - 1)
@@ -28,6 +30,13 @@
 	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
 	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
 	  XFS_EXTENTADD_SPACE_RES(mp,w))
+#define XFS_SWAP_RMAP_SPACE_RES(mp,b,w)\
+	(((b + XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_EXTENTS_PER_BLOCK(mp)) * \
+	  XFS_EXTENTADD_SPACE_RES(mp,w) + \
+	 ((b + XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp) - 1) / \
+	  XFS_MAX_CONTIG_RMAPS_PER_BLOCK(mp)) * \
+	  (mp)->m_rmap_maxlevels)
 #define	XFS_DAENTER_1B(mp,w)	\
 	((w) == XFS_DATA_FORK ? (mp)->m_dir_geo->fsbcount : 1)
 #define	XFS_DAENTER_DBS(mp,w)	\
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 3d503647f26b..8d74870468c2 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -90,6 +90,7 @@ typedef __int64_t	xfs_sfiloff_t;	/* signed block number in a file */
  */
 #define	XFS_DATA_FORK	0
 #define	XFS_ATTR_FORK	1
+#define	XFS_COW_FORK	2
 
 /*
  * Min numbers of data/attr fork btree root pointers.
@@ -109,7 +110,7 @@ typedef enum {
 
 typedef enum {
 	XFS_BTNUM_BNOi, XFS_BTNUM_CNTi, XFS_BTNUM_RMAPi, XFS_BTNUM_BMAPi,
-	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_MAX
+	XFS_BTNUM_INOi, XFS_BTNUM_FINOi, XFS_BTNUM_REFCi, XFS_BTNUM_MAX
 } xfs_btnum_t;
 
 struct xfs_name {
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4a28fa91e3b1..3e57a56cf829 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -31,6 +31,7 @@
 #include "xfs_bmap.h"
 #include "xfs_bmap_util.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
 #include <linux/gfp.h>
 #include <linux/mpage.h>
 #include <linux/pagevec.h>
@@ -39,6 +40,7 @@
 /* flags for direct write completions */
 #define XFS_DIO_FLAG_UNWRITTEN	(1 << 0)
 #define XFS_DIO_FLAG_APPEND	(1 << 1)
+#define XFS_DIO_FLAG_COW	(1 << 2)
 
 /*
  * structure owned by writepages passed to individual writepage calls
@@ -287,6 +289,25 @@ xfs_end_io(
 		error = -EIO;
 
 	/*
+	 * For a CoW extent, we need to move the mapping from the CoW fork
+	 * to the data fork.  If instead an error happened, just dump the
+	 * new blocks.
+	 */
+	if (ioend->io_type == XFS_IO_COW) {
+		if (error)
+			goto done;
+		if (ioend->io_bio->bi_error) {
+			error = xfs_reflink_cancel_cow_range(ip,
+					ioend->io_offset, ioend->io_size);
+			goto done;
+		}
+		error = xfs_reflink_end_cow(ip, ioend->io_offset,
+				ioend->io_size);
+		if (error)
+			goto done;
+	}
+
+	/*
 	 * For unwritten extents we need to issue transactions to convert a
 	 * range to normal written extens after the data I/O has finished.
 	 * Detecting and handling completion IO errors is done individually
@@ -301,7 +322,8 @@ xfs_end_io(
 	} else if (ioend->io_append_trans) {
 		error = xfs_setfilesize_ioend(ioend, error);
 	} else {
-		ASSERT(!xfs_ioend_is_append(ioend));
+		ASSERT(!xfs_ioend_is_append(ioend) ||
+		       ioend->io_type == XFS_IO_COW);
 	}
 
 done:
@@ -315,7 +337,7 @@ xfs_end_bio(
 	struct xfs_ioend	*ioend = bio->bi_private;
 	struct xfs_mount	*mp = XFS_I(ioend->io_inode)->i_mount;
 
-	if (ioend->io_type == XFS_IO_UNWRITTEN)
+	if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 		queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 	else if (ioend->io_append_trans)
 		queue_work(mp->m_data_workqueue, &ioend->io_work);
@@ -341,6 +363,7 @@ xfs_map_blocks(
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
+	ASSERT(type != XFS_IO_COW);
 	if (type == XFS_IO_UNWRITTEN)
 		bmapi_flags |= XFS_BMAPI_IGSTATE;
 
@@ -355,6 +378,13 @@ xfs_map_blocks(
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 				imap, &nimaps, bmapi_flags);
+	/*
+	 * Truncate an overwrite extent if there's a pending CoW
+	 * reservation before the end of this extent.  This forces us
+	 * to come back to writepage to take care of the CoW.
+	 */
+	if (nimaps && type == XFS_IO_OVERWRITE)
+		xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
 	xfs_iunlock(ip, XFS_ILOCK_SHARED);
 
 	if (error)
@@ -362,7 +392,8 @@ xfs_map_blocks(
 
 	if (type == XFS_IO_DELALLOC &&
 	    (!nimaps || isnullstartblock(imap->br_startblock))) {
-		error = xfs_iomap_write_allocate(ip, offset, imap);
+		error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
+				imap);
 		if (!error)
 			trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 		return error;
@@ -737,6 +768,56 @@ out_invalidate:
 	return;
 }
 
+static int
+xfs_map_cow(
+	struct xfs_writepage_ctx *wpc,
+	struct inode		*inode,
+	loff_t			offset,
+	unsigned int		*new_type)
+{
+	struct xfs_inode	*ip = XFS_I(inode);
+	struct xfs_bmbt_irec	imap;
+	bool			is_cow = false, need_alloc = false;
+	int			error;
+
+	/*
+	 * If we already have a valid COW mapping keep using it.
+	 */
+	if (wpc->io_type == XFS_IO_COW) {
+		wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
+		if (wpc->imap_valid) {
+			*new_type = XFS_IO_COW;
+			return 0;
+		}
+	}
+
+	/*
+	 * Else we need to check if there is a COW mapping at this offset.
+	 */
+	xfs_ilock(ip, XFS_ILOCK_SHARED);
+	is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
+	xfs_iunlock(ip, XFS_ILOCK_SHARED);
+
+	if (!is_cow)
+		return 0;
+
+	/*
+	 * And if the COW mapping has a delayed extent here we need to
+	 * allocate real space for it now.
+	 */
+	if (need_alloc) {
+		error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
+				&imap);
+		if (error)
+			return error;
+	}
+
+	wpc->io_type = *new_type = XFS_IO_COW;
+	wpc->imap_valid = true;
+	wpc->imap = imap;
+	return 0;
+}
+
 /*
  * We implement an immediate ioend submission policy here to avoid needing to
  * chain multiple ioends and hence nest mempool allocations which can violate
@@ -769,6 +850,7 @@ xfs_writepage_map(
 	int			error = 0;
 	int			count = 0;
 	int			uptodate = 1;
+	unsigned int		new_type;
 
 	bh = head = page_buffers(page);
 	offset = page_offset(page);
@@ -789,22 +871,13 @@ xfs_writepage_map(
 			continue;
 		}
 
-		if (buffer_unwritten(bh)) {
-			if (wpc->io_type != XFS_IO_UNWRITTEN) {
-				wpc->io_type = XFS_IO_UNWRITTEN;
-				wpc->imap_valid = false;
-			}
-		} else if (buffer_delay(bh)) {
-			if (wpc->io_type != XFS_IO_DELALLOC) {
-				wpc->io_type = XFS_IO_DELALLOC;
-				wpc->imap_valid = false;
-			}
-		} else if (buffer_uptodate(bh)) {
-			if (wpc->io_type != XFS_IO_OVERWRITE) {
-				wpc->io_type = XFS_IO_OVERWRITE;
-				wpc->imap_valid = false;
-			}
-		} else {
+		if (buffer_unwritten(bh))
+			new_type = XFS_IO_UNWRITTEN;
+		else if (buffer_delay(bh))
+			new_type = XFS_IO_DELALLOC;
+		else if (buffer_uptodate(bh))
+			new_type = XFS_IO_OVERWRITE;
+		else {
 			if (PageUptodate(page))
 				ASSERT(buffer_mapped(bh));
 			/*
@@ -817,6 +890,17 @@ xfs_writepage_map(
 			continue;
 		}
 
+		if (xfs_is_reflink_inode(XFS_I(inode))) {
+			error = xfs_map_cow(wpc, inode, offset, &new_type);
+			if (error)
+				goto out;
+		}
+
+		if (wpc->io_type != new_type) {
+			wpc->io_type = new_type;
+			wpc->imap_valid = false;
+		}
+
 		if (wpc->imap_valid)
 			wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 							 offset);
@@ -1107,18 +1191,24 @@ xfs_map_direct(
 	struct inode		*inode,
 	struct buffer_head	*bh_result,
 	struct xfs_bmbt_irec	*imap,
-	xfs_off_t		offset)
+	xfs_off_t		offset,
+	bool			is_cow)
 {
 	uintptr_t		*flags = (uintptr_t *)&bh_result->b_private;
 	xfs_off_t		size = bh_result->b_size;
 
 	trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
-		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : XFS_IO_OVERWRITE, imap);
+		ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
+		XFS_IO_OVERWRITE, imap);
 
 	if (ISUNWRITTEN(imap)) {
 		*flags |= XFS_DIO_FLAG_UNWRITTEN;
 		set_buffer_defer_completion(bh_result);
-	} else if (offset + size > i_size_read(inode) || offset + size < 0) {
+	} else if (is_cow) {
+		*flags |= XFS_DIO_FLAG_COW;
+		set_buffer_defer_completion(bh_result);
+	}
+	if (offset + size > i_size_read(inode) || offset + size < 0) {
 		*flags |= XFS_DIO_FLAG_APPEND;
 		set_buffer_defer_completion(bh_result);
 	}
@@ -1164,6 +1254,44 @@ xfs_map_trim_size(
 	bh_result->b_size = mapping_size;
 }
 
+/* Bounce unaligned directio writes to the page cache. */
+static int
+xfs_bounce_unaligned_dio_write(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		offset_fsb,
+	struct xfs_bmbt_irec	*imap)
+{
+	struct xfs_bmbt_irec	irec;
+	xfs_fileoff_t		delta;
+	bool			shared;
+	bool			x;
+	int			error;
+
+	irec = *imap;
+	if (offset_fsb > irec.br_startoff) {
+		delta = offset_fsb - irec.br_startoff;
+		irec.br_blockcount -= delta;
+		irec.br_startblock += delta;
+		irec.br_startoff = offset_fsb;
+	}
+	error = xfs_reflink_trim_around_shared(ip, &irec, &shared, &x);
+	if (error)
+		return error;
+
+	/*
+	 * We're here because we're trying to do a directio write to a
+	 * region that isn't aligned to a filesystem block.  If any part
+	 * of the extent is shared, fall back to buffered mode to handle
+	 * the RMW.  This is done by returning -EREMCHG ("remote addr
+	 * changed"), which is caught further up the call stack.
+	 */
+	if (shared) {
+		trace_xfs_reflink_bounce_dio_write(ip, imap);
+		return -EREMCHG;
+	}
+	return 0;
+}
+
 STATIC int
 __xfs_get_blocks(
 	struct inode		*inode,
@@ -1183,6 +1311,8 @@ __xfs_get_blocks(
 	xfs_off_t		offset;
 	ssize_t			size;
 	int			new = 0;
+	bool			is_cow = false;
+	bool			need_alloc = false;
 
 	BUG_ON(create && !direct);
 
@@ -1208,8 +1338,26 @@ __xfs_get_blocks(
 	end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
 	offset_fsb = XFS_B_TO_FSBT(mp, offset);
 
-	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
-				&imap, &nimaps, XFS_BMAPI_ENTIRE);
+	if (create && direct && xfs_is_reflink_inode(ip))
+		is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
+					&need_alloc);
+	if (!is_cow) {
+		error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
+					&imap, &nimaps, XFS_BMAPI_ENTIRE);
+		/*
+		 * Truncate an overwrite extent if there's a pending CoW
+		 * reservation before the end of this extent.  This
+		 * forces us to come back to get_blocks to take care of
+		 * the CoW.
+		 */
+		if (create && direct && nimaps &&
+		    imap.br_startblock != HOLESTARTBLOCK &&
+		    imap.br_startblock != DELAYSTARTBLOCK &&
+		    !ISUNWRITTEN(&imap))
+			xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
+					&imap);
+	}
+	ASSERT(!need_alloc);
 	if (error)
 		goto out_unlock;
 
@@ -1261,6 +1409,13 @@ __xfs_get_blocks(
 	if (imap.br_startblock != HOLESTARTBLOCK &&
 	    imap.br_startblock != DELAYSTARTBLOCK &&
 	    (create || !ISUNWRITTEN(&imap))) {
+		if (create && direct && !is_cow) {
+			error = xfs_bounce_unaligned_dio_write(ip, offset_fsb,
+					&imap);
+			if (error)
+				return error;
+		}
+
 		xfs_map_buffer(inode, bh_result, &imap, offset);
 		if (ISUNWRITTEN(&imap))
 			set_buffer_unwritten(bh_result);
@@ -1269,7 +1424,8 @@ __xfs_get_blocks(
 			if (dax_fault)
 				ASSERT(!ISUNWRITTEN(&imap));
 			else
-				xfs_map_direct(inode, bh_result, &imap, offset);
+				xfs_map_direct(inode, bh_result, &imap, offset,
+						is_cow);
 		}
 	}
 
@@ -1391,11 +1547,14 @@ xfs_end_io_direct_write(
 		i_size_write(inode, offset + size);
 	spin_unlock(&ip->i_flags_lock);
 
+	if (flags & XFS_DIO_FLAG_COW)
+		error = xfs_reflink_end_cow(ip, offset, size);
 	if (flags & XFS_DIO_FLAG_UNWRITTEN) {
 		trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
 
 		error = xfs_iomap_write_unwritten(ip, offset, size);
-	} else if (flags & XFS_DIO_FLAG_APPEND) {
+	}
+	if (flags & XFS_DIO_FLAG_APPEND) {
 		trace_xfs_end_io_direct_write_append(ip, offset, size);
 
 		error = xfs_setfilesize(ip, offset, size);
@@ -1425,6 +1584,17 @@ xfs_vm_bmap(
 
 	trace_xfs_vm_bmap(XFS_I(inode));
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
+
+	/*
+	 * The swap code (ab-)uses ->bmap to get a block mapping and then
+	 * bypasseѕ the file system for actual I/O.  We really can't allow
+	 * that on reflinks inodes, so we have to skip out here.  And yes,
+	 * 0 is the magic code for a bmap error..
+	 */
+	if (xfs_is_reflink_inode(ip)) {
+		xfs_iunlock(ip, XFS_IOLOCK_SHARED);
+		return 0;
+	}
 	filemap_write_and_wait(mapping);
 	xfs_iunlock(ip, XFS_IOLOCK_SHARED);
 	return generic_block_bmap(mapping, block, xfs_get_blocks);
diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h
index 1950e3bca2ac..b3c6634f9518 100644
--- a/fs/xfs/xfs_aops.h
+++ b/fs/xfs/xfs_aops.h
@@ -28,13 +28,15 @@ enum {
 	XFS_IO_DELALLOC,	/* covers delalloc region */
 	XFS_IO_UNWRITTEN,	/* covers allocated but uninitialized data */
 	XFS_IO_OVERWRITE,	/* covers already allocated extent */
+	XFS_IO_COW,		/* covers copy-on-write extent */
 };
 
 #define XFS_IO_TYPES \
 	{ XFS_IO_INVALID,		"invalid" }, \
 	{ XFS_IO_DELALLOC,		"delalloc" }, \
 	{ XFS_IO_UNWRITTEN,		"unwritten" }, \
-	{ XFS_IO_OVERWRITE,		"overwrite" }
+	{ XFS_IO_OVERWRITE,		"overwrite" }, \
+	{ XFS_IO_COW,			"CoW" }
 
 /*
  * Structure for buffered I/O completions.
diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c
new file mode 100644
index 000000000000..9bf57c76623b
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.c
@@ -0,0 +1,508 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_log.h"
+#include "xfs_bmap.h"
+#include "xfs_icache.h"
+#include "xfs_trace.h"
+
+
+kmem_zone_t	*xfs_bui_zone;
+kmem_zone_t	*xfs_bud_zone;
+
+static inline struct xfs_bui_log_item *BUI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_bui_log_item, bui_item);
+}
+
+void
+xfs_bui_item_free(
+	struct xfs_bui_log_item	*buip)
+{
+	kmem_zone_free(xfs_bui_zone, buip);
+}
+
+STATIC void
+xfs_bui_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_bui_log_item	*buip = BUI_ITEM(lip);
+
+	*nvecs += 1;
+	*nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bui log item. We use only 1 iovec, and we point that
+ * at the bui_log_format structure embedded in the bui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the bui item have been filled.
+ */
+STATIC void
+xfs_bui_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_bui_log_item	*buip = BUI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	ASSERT(atomic_read(&buip->bui_next_extent) ==
+			buip->bui_format.bui_nextents);
+
+	buip->bui_format.bui_type = XFS_LI_BUI;
+	buip->bui_format.bui_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUI_FORMAT, &buip->bui_format,
+			xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents));
+}
+
+/*
+ * Pinning has no meaning for an bui item, so just return.
+ */
+STATIC void
+xfs_bui_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * The unpin operation is the last place an BUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the BUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the BUI to either construct
+ * and commit the BUD or drop the BUD's reference in the event of error. Simply
+ * drop the log's BUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_bui_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_bui_log_item	*buip = BUI_ITEM(lip);
+
+	xfs_bui_release(buip);
+}
+
+/*
+ * BUI items have no locking or pushing.  However, since BUIs are pulled from
+ * the AIL when their corresponding BUDs are committed to disk, their situation
+ * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log.  This should help in getting the BUI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_bui_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The BUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an BUD isn't going to be
+ * constructed and thus we free the BUI here directly.
+ */
+STATIC void
+xfs_bui_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_bui_item_free(BUI_ITEM(lip));
+}
+
+/*
+ * The BUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_bui_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	return lsn;
+}
+
+/*
+ * The BUI dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_bui_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all bui log items.
+ */
+static const struct xfs_item_ops xfs_bui_item_ops = {
+	.iop_size	= xfs_bui_item_size,
+	.iop_format	= xfs_bui_item_format,
+	.iop_pin	= xfs_bui_item_pin,
+	.iop_unpin	= xfs_bui_item_unpin,
+	.iop_unlock	= xfs_bui_item_unlock,
+	.iop_committed	= xfs_bui_item_committed,
+	.iop_push	= xfs_bui_item_push,
+	.iop_committing = xfs_bui_item_committing,
+};
+
+/*
+ * Allocate and initialize an bui item with the given number of extents.
+ */
+struct xfs_bui_log_item *
+xfs_bui_init(
+	struct xfs_mount		*mp)
+
+{
+	struct xfs_bui_log_item		*buip;
+
+	buip = kmem_zone_zalloc(xfs_bui_zone, KM_SLEEP);
+
+	xfs_log_item_init(mp, &buip->bui_item, XFS_LI_BUI, &xfs_bui_item_ops);
+	buip->bui_format.bui_nextents = XFS_BUI_MAX_FAST_EXTENTS;
+	buip->bui_format.bui_id = (uintptr_t)(void *)buip;
+	atomic_set(&buip->bui_next_extent, 0);
+	atomic_set(&buip->bui_refcount, 2);
+
+	return buip;
+}
+
+/*
+ * Freeing the BUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the BUI may not yet have been placed in the AIL
+ * when called by xfs_bui_release() from BUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the BUI.
+ */
+void
+xfs_bui_release(
+	struct xfs_bui_log_item	*buip)
+{
+	if (atomic_dec_and_test(&buip->bui_refcount)) {
+		xfs_trans_ail_remove(&buip->bui_item, SHUTDOWN_LOG_IO_ERROR);
+		xfs_bui_item_free(buip);
+	}
+}
+
+static inline struct xfs_bud_log_item *BUD_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_bud_log_item, bud_item);
+}
+
+STATIC void
+xfs_bud_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_bud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given bud log item. We use only 1 iovec, and we point that
+ * at the bud_log_format structure embedded in the bud item.
+ * It is at this point that we assert that all of the extent
+ * slots in the bud item have been filled.
+ */
+STATIC void
+xfs_bud_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	budp->bud_format.bud_type = XFS_LI_BUD;
+	budp->bud_format.bud_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_BUD_FORMAT, &budp->bud_format,
+			sizeof(struct xfs_bud_log_format));
+}
+
+/*
+ * Pinning has no meaning for an bud item, so just return.
+ */
+STATIC void
+xfs_bud_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an bud item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_bud_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on an bud item.  It is simply stuck
+ * waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_bud_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The BUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the BUI and free the
+ * BUD.
+ */
+STATIC void
+xfs_bud_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
+
+	if (lip->li_flags & XFS_LI_ABORTED) {
+		xfs_bui_release(budp->bud_buip);
+		kmem_zone_free(xfs_bud_zone, budp);
+	}
+}
+
+/*
+ * When the bud item is committed to disk, all we need to do is delete our
+ * reference to our partner bui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.
+ */
+STATIC xfs_lsn_t
+xfs_bud_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_bud_log_item	*budp = BUD_ITEM(lip);
+
+	/*
+	 * Drop the BUI reference regardless of whether the BUD has been
+	 * aborted. Once the BUD transaction is constructed, it is the sole
+	 * responsibility of the BUD to release the BUI (even if the BUI is
+	 * aborted due to log I/O error).
+	 */
+	xfs_bui_release(budp->bud_buip);
+	kmem_zone_free(xfs_bud_zone, budp);
+
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * The BUD dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_bud_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all bud log items.
+ */
+static const struct xfs_item_ops xfs_bud_item_ops = {
+	.iop_size	= xfs_bud_item_size,
+	.iop_format	= xfs_bud_item_format,
+	.iop_pin	= xfs_bud_item_pin,
+	.iop_unpin	= xfs_bud_item_unpin,
+	.iop_unlock	= xfs_bud_item_unlock,
+	.iop_committed	= xfs_bud_item_committed,
+	.iop_push	= xfs_bud_item_push,
+	.iop_committing = xfs_bud_item_committing,
+};
+
+/*
+ * Allocate and initialize an bud item with the given number of extents.
+ */
+struct xfs_bud_log_item *
+xfs_bud_init(
+	struct xfs_mount		*mp,
+	struct xfs_bui_log_item		*buip)
+
+{
+	struct xfs_bud_log_item	*budp;
+
+	budp = kmem_zone_zalloc(xfs_bud_zone, KM_SLEEP);
+	xfs_log_item_init(mp, &budp->bud_item, XFS_LI_BUD, &xfs_bud_item_ops);
+	budp->bud_buip = buip;
+	budp->bud_format.bud_bui_id = buip->bui_format.bui_id;
+
+	return budp;
+}
+
+/*
+ * Process a bmap update intent item that was recovered from the log.
+ * We need to update some inode's bmbt.
+ */
+int
+xfs_bui_recover(
+	struct xfs_mount		*mp,
+	struct xfs_bui_log_item		*buip)
+{
+	int				error = 0;
+	unsigned int			bui_type;
+	struct xfs_map_extent		*bmap;
+	xfs_fsblock_t			startblock_fsb;
+	xfs_fsblock_t			inode_fsb;
+	bool				op_ok;
+	struct xfs_bud_log_item		*budp;
+	enum xfs_bmap_intent_type	type;
+	int				whichfork;
+	xfs_exntst_t			state;
+	struct xfs_trans		*tp;
+	struct xfs_inode		*ip = NULL;
+	struct xfs_defer_ops		dfops;
+	xfs_fsblock_t			firstfsb;
+
+	ASSERT(!test_bit(XFS_BUI_RECOVERED, &buip->bui_flags));
+
+	/* Only one mapping operation per BUI... */
+	if (buip->bui_format.bui_nextents != XFS_BUI_MAX_FAST_EXTENTS) {
+		set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+		xfs_bui_release(buip);
+		return -EIO;
+	}
+
+	/*
+	 * First check the validity of the extent described by the
+	 * BUI.  If anything is bad, then toss the BUI.
+	 */
+	bmap = &buip->bui_format.bui_extents[0];
+	startblock_fsb = XFS_BB_TO_FSB(mp,
+			   XFS_FSB_TO_DADDR(mp, bmap->me_startblock));
+	inode_fsb = XFS_BB_TO_FSB(mp, XFS_FSB_TO_DADDR(mp,
+			XFS_INO_TO_FSB(mp, bmap->me_owner)));
+	switch (bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		op_ok = true;
+		break;
+	default:
+		op_ok = false;
+		break;
+	}
+	if (!op_ok || startblock_fsb == 0 ||
+	    bmap->me_len == 0 ||
+	    inode_fsb == 0 ||
+	    startblock_fsb >= mp->m_sb.sb_dblocks ||
+	    bmap->me_len >= mp->m_sb.sb_agblocks ||
+	    inode_fsb >= mp->m_sb.sb_dblocks ||
+	    (bmap->me_flags & ~XFS_BMAP_EXTENT_FLAGS)) {
+		/*
+		 * This will pull the BUI from the AIL and
+		 * free the memory associated with it.
+		 */
+		set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+		xfs_bui_release(buip);
+		return -EIO;
+	}
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	if (error)
+		return error;
+	budp = xfs_trans_get_bud(tp, buip);
+
+	/* Grab the inode. */
+	error = xfs_iget(mp, tp, bmap->me_owner, 0, XFS_ILOCK_EXCL, &ip);
+	if (error)
+		goto err_inode;
+
+	if (VFS_I(ip)->i_nlink == 0)
+		xfs_iflags_set(ip, XFS_IRECOVERY);
+	xfs_defer_init(&dfops, &firstfsb);
+
+	/* Process deferred bmap item. */
+	state = (bmap->me_flags & XFS_BMAP_EXTENT_UNWRITTEN) ?
+			XFS_EXT_UNWRITTEN : XFS_EXT_NORM;
+	whichfork = (bmap->me_flags & XFS_BMAP_EXTENT_ATTR_FORK) ?
+			XFS_ATTR_FORK : XFS_DATA_FORK;
+	bui_type = bmap->me_flags & XFS_BMAP_EXTENT_TYPE_MASK;
+	switch (bui_type) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		type = bui_type;
+		break;
+	default:
+		error = -EFSCORRUPTED;
+		goto err_dfops;
+	}
+	xfs_trans_ijoin(tp, ip, 0);
+
+	error = xfs_trans_log_finish_bmap_update(tp, budp, &dfops, type,
+			ip, whichfork, bmap->me_startoff,
+			bmap->me_startblock, bmap->me_len,
+			state);
+	if (error)
+		goto err_dfops;
+
+	/* Finish transaction, free inodes. */
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto err_dfops;
+
+	set_bit(XFS_BUI_RECOVERED, &buip->bui_flags);
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	IRELE(ip);
+
+	return error;
+
+err_dfops:
+	xfs_defer_cancel(&dfops);
+err_inode:
+	xfs_trans_cancel(tp);
+	if (ip) {
+		xfs_iunlock(ip, XFS_ILOCK_EXCL);
+		IRELE(ip);
+	}
+	return error;
+}
diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h
new file mode 100644
index 000000000000..c867daae4a3c
--- /dev/null
+++ b/fs/xfs/xfs_bmap_item.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef	__XFS_BMAP_ITEM_H__
+#define	__XFS_BMAP_ITEM_H__
+
+/*
+ * There are (currently) two pairs of bmap btree redo item types: map & unmap.
+ * The common abbreviations for these are BUI (bmap update intent) and BUD
+ * (bmap update done).  The redo item type is encoded in the flags field of
+ * each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same transaction
+ * that records the associated bmbt updates.
+ *
+ * Should the system crash after the commit of the first transaction but
+ * before the commit of the final transaction in a series, log recovery will
+ * use the redo information recorded by the intent items to replay the
+ * bmbt metadata updates in the non-first transaction.
+ */
+
+/* kernel only BUI/BUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define	XFS_BUI_MAX_FAST_EXTENTS	1
+
+/*
+ * Define BUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define	XFS_BUI_RECOVERED		1
+
+/*
+ * This is the "bmap update intent" log item.  It is used to log the fact that
+ * some reverse mappings need to change.  It is used in conjunction with the
+ * "bmap update done" log item described below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item; see the
+ * comments about that structure (in xfs_extfree_item.h) for more details.
+ */
+struct xfs_bui_log_item {
+	struct xfs_log_item		bui_item;
+	atomic_t			bui_refcount;
+	atomic_t			bui_next_extent;
+	unsigned long			bui_flags;	/* misc flags */
+	struct xfs_bui_log_format	bui_format;
+};
+
+static inline size_t
+xfs_bui_log_item_sizeof(
+	unsigned int		nr)
+{
+	return offsetof(struct xfs_bui_log_item, bui_format) +
+			xfs_bui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "bmap update done" log item.  It is used to log the fact that
+ * some bmbt updates mentioned in an earlier bui item have been performed.
+ */
+struct xfs_bud_log_item {
+	struct xfs_log_item		bud_item;
+	struct xfs_bui_log_item		*bud_buip;
+	struct xfs_bud_log_format	bud_format;
+};
+
+extern struct kmem_zone	*xfs_bui_zone;
+extern struct kmem_zone	*xfs_bud_zone;
+
+struct xfs_bui_log_item *xfs_bui_init(struct xfs_mount *);
+struct xfs_bud_log_item *xfs_bud_init(struct xfs_mount *,
+		struct xfs_bui_log_item *);
+void xfs_bui_item_free(struct xfs_bui_log_item *);
+void xfs_bui_release(struct xfs_bui_log_item *);
+int xfs_bui_recover(struct xfs_mount *mp, struct xfs_bui_log_item *buip);
+
+#endif	/* __XFS_BMAP_ITEM_H__ */
diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c
index e827d657c314..552465e011ec 100644
--- a/fs/xfs/xfs_bmap_util.c
+++ b/fs/xfs/xfs_bmap_util.c
@@ -42,6 +42,9 @@
 #include "xfs_icache.h"
 #include "xfs_log.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_iomap.h"
+#include "xfs_reflink.h"
+#include "xfs_refcount.h"
 
 /* Kernel only BMAP related definitions and functions */
 
@@ -389,11 +392,13 @@ xfs_bmap_count_blocks(
 STATIC int
 xfs_getbmapx_fix_eof_hole(
 	xfs_inode_t		*ip,		/* xfs incore inode pointer */
+	int			whichfork,
 	struct getbmapx		*out,		/* output structure */
 	int			prealloced,	/* this is a file with
 						 * preallocated data space */
 	__int64_t		end,		/* last block requested */
-	xfs_fsblock_t		startblock)
+	xfs_fsblock_t		startblock,
+	bool			moretocome)
 {
 	__int64_t		fixlen;
 	xfs_mount_t		*mp;		/* file system mount point */
@@ -418,8 +423,9 @@ xfs_getbmapx_fix_eof_hole(
 		else
 			out->bmv_block = xfs_fsb_to_db(ip, startblock);
 		fileblock = XFS_BB_TO_FSB(ip->i_mount, out->bmv_offset);
-		ifp = XFS_IFORK_PTR(ip, XFS_DATA_FORK);
-		if (xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
+		ifp = XFS_IFORK_PTR(ip, whichfork);
+		if (!moretocome &&
+		    xfs_iext_bno_to_ext(ifp, fileblock, &lastx) &&
 		   (lastx == (ifp->if_bytes / (uint)sizeof(xfs_bmbt_rec_t))-1))
 			out->bmv_oflags |= BMV_OF_LAST;
 	}
@@ -427,6 +433,81 @@ xfs_getbmapx_fix_eof_hole(
 	return 1;
 }
 
+/* Adjust the reported bmap around shared/unshared extent transitions. */
+STATIC int
+xfs_getbmap_adjust_shared(
+	struct xfs_inode		*ip,
+	int				whichfork,
+	struct xfs_bmbt_irec		*map,
+	struct getbmapx			*out,
+	struct xfs_bmbt_irec		*next_map)
+{
+	struct xfs_mount		*mp = ip->i_mount;
+	xfs_agnumber_t			agno;
+	xfs_agblock_t			agbno;
+	xfs_agblock_t			ebno;
+	xfs_extlen_t			elen;
+	xfs_extlen_t			nlen;
+	int				error;
+
+	next_map->br_startblock = NULLFSBLOCK;
+	next_map->br_startoff = NULLFILEOFF;
+	next_map->br_blockcount = 0;
+
+	/* Only written data blocks can be shared. */
+	if (!xfs_is_reflink_inode(ip) || whichfork != XFS_DATA_FORK ||
+	    map->br_startblock == DELAYSTARTBLOCK ||
+	    map->br_startblock == HOLESTARTBLOCK ||
+	    ISUNWRITTEN(map))
+		return 0;
+
+	agno = XFS_FSB_TO_AGNO(mp, map->br_startblock);
+	agbno = XFS_FSB_TO_AGBNO(mp, map->br_startblock);
+	error = xfs_reflink_find_shared(mp, agno, agbno, map->br_blockcount,
+			&ebno, &elen, true);
+	if (error)
+		return error;
+
+	if (ebno == NULLAGBLOCK) {
+		/* No shared blocks at all. */
+		return 0;
+	} else if (agbno == ebno) {
+		/*
+		 * Shared extent at (agbno, elen).  Shrink the reported
+		 * extent length and prepare to move the start of map[i]
+		 * to agbno+elen, with the aim of (re)formatting the new
+		 * map[i] the next time through the inner loop.
+		 */
+		out->bmv_length = XFS_FSB_TO_BB(mp, elen);
+		out->bmv_oflags |= BMV_OF_SHARED;
+		if (elen != map->br_blockcount) {
+			*next_map = *map;
+			next_map->br_startblock += elen;
+			next_map->br_startoff += elen;
+			next_map->br_blockcount -= elen;
+		}
+		map->br_blockcount -= elen;
+	} else {
+		/*
+		 * There's an unshared extent (agbno, ebno - agbno)
+		 * followed by shared extent at (ebno, elen).  Shrink
+		 * the reported extent length to cover only the unshared
+		 * extent and prepare to move up the start of map[i] to
+		 * ebno, with the aim of (re)formatting the new map[i]
+		 * the next time through the inner loop.
+		 */
+		*next_map = *map;
+		nlen = ebno - agbno;
+		out->bmv_length = XFS_FSB_TO_BB(mp, nlen);
+		next_map->br_startblock += nlen;
+		next_map->br_startoff += nlen;
+		next_map->br_blockcount -= nlen;
+		map->br_blockcount -= nlen;
+	}
+
+	return 0;
+}
+
 /*
  * Get inode's extents as described in bmv, and format for output.
  * Calls formatter to fill the user's buffer until all extents
@@ -459,12 +540,28 @@ xfs_getbmap(
 	int			iflags;		/* interface flags */
 	int			bmapi_flags;	/* flags for xfs_bmapi */
 	int			cur_ext = 0;
+	struct xfs_bmbt_irec	inject_map;
 
 	mp = ip->i_mount;
 	iflags = bmv->bmv_iflags;
-	whichfork = iflags & BMV_IF_ATTRFORK ? XFS_ATTR_FORK : XFS_DATA_FORK;
 
-	if (whichfork == XFS_ATTR_FORK) {
+#ifndef DEBUG
+	/* Only allow CoW fork queries if we're debugging. */
+	if (iflags & BMV_IF_COWFORK)
+		return -EINVAL;
+#endif
+	if ((iflags & BMV_IF_ATTRFORK) && (iflags & BMV_IF_COWFORK))
+		return -EINVAL;
+
+	if (iflags & BMV_IF_ATTRFORK)
+		whichfork = XFS_ATTR_FORK;
+	else if (iflags & BMV_IF_COWFORK)
+		whichfork = XFS_COW_FORK;
+	else
+		whichfork = XFS_DATA_FORK;
+
+	switch (whichfork) {
+	case XFS_ATTR_FORK:
 		if (XFS_IFORK_Q(ip)) {
 			if (ip->i_d.di_aformat != XFS_DINODE_FMT_EXTENTS &&
 			    ip->i_d.di_aformat != XFS_DINODE_FMT_BTREE &&
@@ -480,7 +577,20 @@ xfs_getbmap(
 
 		prealloced = 0;
 		fixlen = 1LL << 32;
-	} else {
+		break;
+	case XFS_COW_FORK:
+		if (ip->i_cformat != XFS_DINODE_FMT_EXTENTS)
+			return -EINVAL;
+
+		if (xfs_get_cowextsz_hint(ip)) {
+			prealloced = 1;
+			fixlen = mp->m_super->s_maxbytes;
+		} else {
+			prealloced = 0;
+			fixlen = XFS_ISIZE(ip);
+		}
+		break;
+	default:
 		if (ip->i_d.di_format != XFS_DINODE_FMT_EXTENTS &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_BTREE &&
 		    ip->i_d.di_format != XFS_DINODE_FMT_LOCAL)
@@ -494,6 +604,7 @@ xfs_getbmap(
 			prealloced = 0;
 			fixlen = XFS_ISIZE(ip);
 		}
+		break;
 	}
 
 	if (bmv->bmv_length == -1) {
@@ -520,7 +631,8 @@ xfs_getbmap(
 		return -ENOMEM;
 
 	xfs_ilock(ip, XFS_IOLOCK_SHARED);
-	if (whichfork == XFS_DATA_FORK) {
+	switch (whichfork) {
+	case XFS_DATA_FORK:
 		if (!(iflags & BMV_IF_DELALLOC) &&
 		    (ip->i_delayed_blks || XFS_ISIZE(ip) > ip->i_d.di_size)) {
 			error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
@@ -538,8 +650,14 @@ xfs_getbmap(
 		}
 
 		lock = xfs_ilock_data_map_shared(ip);
-	} else {
+		break;
+	case XFS_COW_FORK:
+		lock = XFS_ILOCK_SHARED;
+		xfs_ilock(ip, lock);
+		break;
+	case XFS_ATTR_FORK:
 		lock = xfs_ilock_attr_map_shared(ip);
+		break;
 	}
 
 	/*
@@ -581,7 +699,8 @@ xfs_getbmap(
 			goto out_free_map;
 		ASSERT(nmap <= subnex);
 
-		for (i = 0; i < nmap && nexleft && bmv->bmv_length; i++) {
+		for (i = 0; i < nmap && nexleft && bmv->bmv_length &&
+				cur_ext < bmv->bmv_count; i++) {
 			out[cur_ext].bmv_oflags = 0;
 			if (map[i].br_state == XFS_EXT_UNWRITTEN)
 				out[cur_ext].bmv_oflags |= BMV_OF_PREALLOC;
@@ -614,9 +733,16 @@ xfs_getbmap(
 				goto out_free_map;
 			}
 
-			if (!xfs_getbmapx_fix_eof_hole(ip, &out[cur_ext],
-					prealloced, bmvend,
-					map[i].br_startblock))
+			/* Is this a shared block? */
+			error = xfs_getbmap_adjust_shared(ip, whichfork,
+					&map[i], &out[cur_ext], &inject_map);
+			if (error)
+				goto out_free_map;
+
+			if (!xfs_getbmapx_fix_eof_hole(ip, whichfork,
+					&out[cur_ext], prealloced, bmvend,
+					map[i].br_startblock,
+					inject_map.br_startblock != NULLFSBLOCK))
 				goto out_free_map;
 
 			bmv->bmv_offset =
@@ -636,11 +762,16 @@ xfs_getbmap(
 				continue;
 			}
 
-			nexleft--;
+			if (inject_map.br_startblock != NULLFSBLOCK) {
+				map[i] = inject_map;
+				i--;
+			} else
+				nexleft--;
 			bmv->bmv_entries++;
 			cur_ext++;
 		}
-	} while (nmap && nexleft && bmv->bmv_length);
+	} while (nmap && nexleft && bmv->bmv_length &&
+		 cur_ext < bmv->bmv_count);
 
  out_free_map:
 	kmem_free(map);
@@ -1433,8 +1564,8 @@ xfs_insert_file_space(
  */
 static int
 xfs_swap_extents_check_format(
-	xfs_inode_t	*ip,	/* target inode */
-	xfs_inode_t	*tip)	/* tmp inode */
+	struct xfs_inode	*ip,	/* target inode */
+	struct xfs_inode	*tip)	/* tmp inode */
 {
 
 	/* Should never get a local format */
@@ -1450,6 +1581,13 @@ xfs_swap_extents_check_format(
 		return -EINVAL;
 
 	/*
+	 * If we have to use the (expensive) rmap swap method, we can
+	 * handle any number of extents and any format.
+	 */
+	if (xfs_sb_version_hasrmapbt(&ip->i_mount->m_sb))
+		return 0;
+
+	/*
 	 * if the target inode is in extent form and the temp inode is in btree
 	 * form then we will end up with the target inode in the wrong format
 	 * as we already know there are less extents in the temp inode.
@@ -1518,125 +1656,161 @@ xfs_swap_extent_flush(
 	return 0;
 }
 
-int
-xfs_swap_extents(
-	xfs_inode_t	*ip,	/* target inode */
-	xfs_inode_t	*tip,	/* tmp inode */
-	xfs_swapext_t	*sxp)
+/*
+ * Move extents from one file to another, when rmap is enabled.
+ */
+STATIC int
+xfs_swap_extent_rmap(
+	struct xfs_trans		**tpp,
+	struct xfs_inode		*ip,
+	struct xfs_inode		*tip)
 {
-	xfs_mount_t	*mp = ip->i_mount;
-	xfs_trans_t	*tp;
-	xfs_bstat_t	*sbp = &sxp->sx_stat;
-	xfs_ifork_t	*tempifp, *ifp, *tifp;
-	int		src_log_flags, target_log_flags;
-	int		error = 0;
-	int		aforkblks = 0;
-	int		taforkblks = 0;
-	__uint64_t	tmp;
-	int		lock_flags;
-
-	/* XXX: we can't do this with rmap, will fix later */
-	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
-		return -EOPNOTSUPP;
-
-	tempifp = kmem_alloc(sizeof(xfs_ifork_t), KM_MAYFAIL);
-	if (!tempifp) {
-		error = -ENOMEM;
-		goto out;
-	}
+	struct xfs_bmbt_irec		irec;
+	struct xfs_bmbt_irec		uirec;
+	struct xfs_bmbt_irec		tirec;
+	xfs_fileoff_t			offset_fsb;
+	xfs_fileoff_t			end_fsb;
+	xfs_filblks_t			count_fsb;
+	xfs_fsblock_t			firstfsb;
+	struct xfs_defer_ops		dfops;
+	int				error;
+	xfs_filblks_t			ilen;
+	xfs_filblks_t			rlen;
+	int				nimaps;
+	__uint64_t			tip_flags2;
 
 	/*
-	 * Lock the inodes against other IO, page faults and truncate to
-	 * begin with.  Then we can ensure the inodes are flushed and have no
-	 * page cache safely. Once we have done this we can take the ilocks and
-	 * do the rest of the checks.
+	 * If the source file has shared blocks, we must flag the donor
+	 * file as having shared blocks so that we get the shared-block
+	 * rmap functions when we go to fix up the rmaps.  The flags
+	 * will be switch for reals later.
 	 */
-	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
-	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
-	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
-
-	/* Verify that both files have the same format */
-	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
+	tip_flags2 = tip->i_d.di_flags2;
+	if (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)
+		tip->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+
+	offset_fsb = 0;
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, i_size_read(VFS_I(ip)));
+	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+
+	while (count_fsb) {
+		/* Read extent from the donor file */
+		nimaps = 1;
+		error = xfs_bmapi_read(tip, offset_fsb, count_fsb, &tirec,
+				&nimaps, 0);
+		if (error)
+			goto out;
+		ASSERT(nimaps == 1);
+		ASSERT(tirec.br_startblock != DELAYSTARTBLOCK);
+
+		trace_xfs_swap_extent_rmap_remap(tip, &tirec);
+		ilen = tirec.br_blockcount;
+
+		/* Unmap the old blocks in the source file. */
+		while (tirec.br_blockcount) {
+			xfs_defer_init(&dfops, &firstfsb);
+			trace_xfs_swap_extent_rmap_remap_piece(tip, &tirec);
+
+			/* Read extent from the source file */
+			nimaps = 1;
+			error = xfs_bmapi_read(ip, tirec.br_startoff,
+					tirec.br_blockcount, &irec,
+					&nimaps, 0);
+			if (error)
+				goto out_defer;
+			ASSERT(nimaps == 1);
+			ASSERT(tirec.br_startoff == irec.br_startoff);
+			trace_xfs_swap_extent_rmap_remap_piece(ip, &irec);
+
+			/* Trim the extent. */
+			uirec = tirec;
+			uirec.br_blockcount = rlen = min_t(xfs_filblks_t,
+					tirec.br_blockcount,
+					irec.br_blockcount);
+			trace_xfs_swap_extent_rmap_remap_piece(tip, &uirec);
+
+			/* Remove the mapping from the donor file. */
+			error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
+					tip, &uirec);
+			if (error)
+				goto out_defer;
 
-	/* Verify both files are either real-time or non-realtime */
-	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
-		error = -EINVAL;
-		goto out_unlock;
-	}
+			/* Remove the mapping from the source file. */
+			error = xfs_bmap_unmap_extent((*tpp)->t_mountp, &dfops,
+					ip, &irec);
+			if (error)
+				goto out_defer;
 
-	error = xfs_swap_extent_flush(ip);
-	if (error)
-		goto out_unlock;
-	error = xfs_swap_extent_flush(tip);
-	if (error)
-		goto out_unlock;
+			/* Map the donor file's blocks into the source file. */
+			error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
+					ip, &uirec);
+			if (error)
+				goto out_defer;
 
-	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
-	if (error)
-		goto out_unlock;
+			/* Map the source file's blocks into the donor file. */
+			error = xfs_bmap_map_extent((*tpp)->t_mountp, &dfops,
+					tip, &irec);
+			if (error)
+				goto out_defer;
 
-	/*
-	 * Lock and join the inodes to the tansaction so that transaction commit
-	 * or cancel will unlock the inodes from this point onwards.
-	 */
-	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
-	lock_flags |= XFS_ILOCK_EXCL;
-	xfs_trans_ijoin(tp, ip, lock_flags);
-	xfs_trans_ijoin(tp, tip, lock_flags);
+			error = xfs_defer_finish(tpp, &dfops, ip);
+			if (error)
+				goto out_defer;
 
+			tirec.br_startoff += rlen;
+			if (tirec.br_startblock != HOLESTARTBLOCK &&
+			    tirec.br_startblock != DELAYSTARTBLOCK)
+				tirec.br_startblock += rlen;
+			tirec.br_blockcount -= rlen;
+		}
 
-	/* Verify all data are being swapped */
-	if (sxp->sx_offset != 0 ||
-	    sxp->sx_length != ip->i_d.di_size ||
-	    sxp->sx_length != tip->i_d.di_size) {
-		error = -EFAULT;
-		goto out_trans_cancel;
+		/* Roll on... */
+		count_fsb -= ilen;
+		offset_fsb += ilen;
 	}
 
-	trace_xfs_swap_extent_before(ip, 0);
-	trace_xfs_swap_extent_before(tip, 1);
+	tip->i_d.di_flags2 = tip_flags2;
+	return 0;
 
-	/* check inode formats now that data is flushed */
-	error = xfs_swap_extents_check_format(ip, tip);
-	if (error) {
-		xfs_notice(mp,
-		    "%s: inode 0x%llx format is incompatible for exchanging.",
-				__func__, ip->i_ino);
-		goto out_trans_cancel;
-	}
+out_defer:
+	xfs_defer_cancel(&dfops);
+out:
+	trace_xfs_swap_extent_rmap_error(ip, error, _RET_IP_);
+	tip->i_d.di_flags2 = tip_flags2;
+	return error;
+}
+
+/* Swap the extents of two files by swapping data forks. */
+STATIC int
+xfs_swap_extent_forks(
+	struct xfs_trans	*tp,
+	struct xfs_inode	*ip,
+	struct xfs_inode	*tip,
+	int			*src_log_flags,
+	int			*target_log_flags)
+{
+	struct xfs_ifork	tempifp, *ifp, *tifp;
+	int			aforkblks = 0;
+	int			taforkblks = 0;
+	__uint64_t		tmp;
+	int			error;
 
-	/*
-	 * Compare the current change & modify times with that
-	 * passed in.  If they differ, we abort this swap.
-	 * This is the mechanism used to ensure the calling
-	 * process that the file was not changed out from
-	 * under it.
-	 */
-	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
-	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
-	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
-	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
-		error = -EBUSY;
-		goto out_trans_cancel;
-	}
 	/*
 	 * Count the number of extended attribute blocks
 	 */
 	if ( ((XFS_IFORK_Q(ip) != 0) && (ip->i_d.di_anextents > 0)) &&
 	     (ip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
-		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK, &aforkblks);
+		error = xfs_bmap_count_blocks(tp, ip, XFS_ATTR_FORK,
+				&aforkblks);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 	if ( ((XFS_IFORK_Q(tip) != 0) && (tip->i_d.di_anextents > 0)) &&
 	     (tip->i_d.di_aformat != XFS_DINODE_FMT_LOCAL)) {
 		error = xfs_bmap_count_blocks(tp, tip, XFS_ATTR_FORK,
-			&taforkblks);
+				&taforkblks);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 
 	/*
@@ -1645,31 +1819,23 @@ xfs_swap_extents(
 	 * buffers, and so the validation done on read will expect the owner
 	 * field to be correctly set. Once we change the owners, we can swap the
 	 * inode forks.
-	 *
-	 * Note the trickiness in setting the log flags - we set the owner log
-	 * flag on the opposite inode (i.e. the inode we are setting the new
-	 * owner to be) because once we swap the forks and log that, log
-	 * recovery is going to see the fork as owned by the swapped inode,
-	 * not the pre-swapped inodes.
 	 */
-	src_log_flags = XFS_ILOG_CORE;
-	target_log_flags = XFS_ILOG_CORE;
 	if (ip->i_d.di_version == 3 &&
 	    ip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-		target_log_flags |= XFS_ILOG_DOWNER;
+		(*target_log_flags) |= XFS_ILOG_DOWNER;
 		error = xfs_bmbt_change_owner(tp, ip, XFS_DATA_FORK,
 					      tip->i_ino, NULL);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 
 	if (tip->i_d.di_version == 3 &&
 	    tip->i_d.di_format == XFS_DINODE_FMT_BTREE) {
-		src_log_flags |= XFS_ILOG_DOWNER;
+		(*src_log_flags) |= XFS_ILOG_DOWNER;
 		error = xfs_bmbt_change_owner(tp, tip, XFS_DATA_FORK,
 					      ip->i_ino, NULL);
 		if (error)
-			goto out_trans_cancel;
+			return error;
 	}
 
 	/*
@@ -1677,9 +1843,9 @@ xfs_swap_extents(
 	 */
 	ifp = &ip->i_df;
 	tifp = &tip->i_df;
-	*tempifp = *ifp;	/* struct copy */
+	tempifp = *ifp;		/* struct copy */
 	*ifp = *tifp;		/* struct copy */
-	*tifp = *tempifp;	/* struct copy */
+	*tifp = tempifp;	/* struct copy */
 
 	/*
 	 * Fix the on-disk inode values
@@ -1719,12 +1885,12 @@ xfs_swap_extents(
 			ifp->if_u1.if_extents =
 				ifp->if_u2.if_inline_ext;
 		}
-		src_log_flags |= XFS_ILOG_DEXT;
+		(*src_log_flags) |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
 		ASSERT(ip->i_d.di_version < 3 ||
-		       (src_log_flags & XFS_ILOG_DOWNER));
-		src_log_flags |= XFS_ILOG_DBROOT;
+		       (*src_log_flags & XFS_ILOG_DOWNER));
+		(*src_log_flags) |= XFS_ILOG_DBROOT;
 		break;
 	}
 
@@ -1738,15 +1904,166 @@ xfs_swap_extents(
 			tifp->if_u1.if_extents =
 				tifp->if_u2.if_inline_ext;
 		}
-		target_log_flags |= XFS_ILOG_DEXT;
+		(*target_log_flags) |= XFS_ILOG_DEXT;
 		break;
 	case XFS_DINODE_FMT_BTREE:
-		target_log_flags |= XFS_ILOG_DBROOT;
+		(*target_log_flags) |= XFS_ILOG_DBROOT;
 		ASSERT(tip->i_d.di_version < 3 ||
-		       (target_log_flags & XFS_ILOG_DOWNER));
+		       (*target_log_flags & XFS_ILOG_DOWNER));
 		break;
 	}
 
+	return 0;
+}
+
+int
+xfs_swap_extents(
+	struct xfs_inode	*ip,	/* target inode */
+	struct xfs_inode	*tip,	/* tmp inode */
+	struct xfs_swapext	*sxp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	struct xfs_bstat	*sbp = &sxp->sx_stat;
+	int			src_log_flags, target_log_flags;
+	int			error = 0;
+	int			lock_flags;
+	struct xfs_ifork	*cowfp;
+	__uint64_t		f;
+	int			resblks;
+
+	/*
+	 * Lock the inodes against other IO, page faults and truncate to
+	 * begin with.  Then we can ensure the inodes are flushed and have no
+	 * page cache safely. Once we have done this we can take the ilocks and
+	 * do the rest of the checks.
+	 */
+	lock_flags = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL;
+	xfs_lock_two_inodes(ip, tip, XFS_IOLOCK_EXCL);
+	xfs_lock_two_inodes(ip, tip, XFS_MMAPLOCK_EXCL);
+
+	/* Verify that both files have the same format */
+	if ((VFS_I(ip)->i_mode & S_IFMT) != (VFS_I(tip)->i_mode & S_IFMT)) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	/* Verify both files are either real-time or non-realtime */
+	if (XFS_IS_REALTIME_INODE(ip) != XFS_IS_REALTIME_INODE(tip)) {
+		error = -EINVAL;
+		goto out_unlock;
+	}
+
+	error = xfs_swap_extent_flush(ip);
+	if (error)
+		goto out_unlock;
+	error = xfs_swap_extent_flush(tip);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Extent "swapping" with rmap requires a permanent reservation and
+	 * a block reservation because it's really just a remap operation
+	 * performed with log redo items!
+	 */
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
+		/*
+		 * Conceptually this shouldn't affect the shape of either
+		 * bmbt, but since we atomically move extents one by one,
+		 * we reserve enough space to rebuild both trees.
+		 */
+		resblks = XFS_SWAP_RMAP_SPACE_RES(mp,
+				XFS_IFORK_NEXTENTS(ip, XFS_DATA_FORK),
+				XFS_DATA_FORK) +
+			  XFS_SWAP_RMAP_SPACE_RES(mp,
+				XFS_IFORK_NEXTENTS(tip, XFS_DATA_FORK),
+				XFS_DATA_FORK);
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks,
+				0, 0, &tp);
+	} else
+		error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0,
+				0, 0, &tp);
+	if (error)
+		goto out_unlock;
+
+	/*
+	 * Lock and join the inodes to the tansaction so that transaction commit
+	 * or cancel will unlock the inodes from this point onwards.
+	 */
+	xfs_lock_two_inodes(ip, tip, XFS_ILOCK_EXCL);
+	lock_flags |= XFS_ILOCK_EXCL;
+	xfs_trans_ijoin(tp, ip, 0);
+	xfs_trans_ijoin(tp, tip, 0);
+
+
+	/* Verify all data are being swapped */
+	if (sxp->sx_offset != 0 ||
+	    sxp->sx_length != ip->i_d.di_size ||
+	    sxp->sx_length != tip->i_d.di_size) {
+		error = -EFAULT;
+		goto out_trans_cancel;
+	}
+
+	trace_xfs_swap_extent_before(ip, 0);
+	trace_xfs_swap_extent_before(tip, 1);
+
+	/* check inode formats now that data is flushed */
+	error = xfs_swap_extents_check_format(ip, tip);
+	if (error) {
+		xfs_notice(mp,
+		    "%s: inode 0x%llx format is incompatible for exchanging.",
+				__func__, ip->i_ino);
+		goto out_trans_cancel;
+	}
+
+	/*
+	 * Compare the current change & modify times with that
+	 * passed in.  If they differ, we abort this swap.
+	 * This is the mechanism used to ensure the calling
+	 * process that the file was not changed out from
+	 * under it.
+	 */
+	if ((sbp->bs_ctime.tv_sec != VFS_I(ip)->i_ctime.tv_sec) ||
+	    (sbp->bs_ctime.tv_nsec != VFS_I(ip)->i_ctime.tv_nsec) ||
+	    (sbp->bs_mtime.tv_sec != VFS_I(ip)->i_mtime.tv_sec) ||
+	    (sbp->bs_mtime.tv_nsec != VFS_I(ip)->i_mtime.tv_nsec)) {
+		error = -EBUSY;
+		goto out_trans_cancel;
+	}
+
+	/*
+	 * Note the trickiness in setting the log flags - we set the owner log
+	 * flag on the opposite inode (i.e. the inode we are setting the new
+	 * owner to be) because once we swap the forks and log that, log
+	 * recovery is going to see the fork as owned by the swapped inode,
+	 * not the pre-swapped inodes.
+	 */
+	src_log_flags = XFS_ILOG_CORE;
+	target_log_flags = XFS_ILOG_CORE;
+
+	if (xfs_sb_version_hasrmapbt(&mp->m_sb))
+		error = xfs_swap_extent_rmap(&tp, ip, tip);
+	else
+		error = xfs_swap_extent_forks(tp, ip, tip, &src_log_flags,
+				&target_log_flags);
+	if (error)
+		goto out_trans_cancel;
+
+	/* Do we have to swap reflink flags? */
+	if ((ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK) ^
+	    (tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK)) {
+		f = ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+		ip->i_d.di_flags2 |= tip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+		tip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+		tip->i_d.di_flags2 |= f & XFS_DIFLAG2_REFLINK;
+		cowfp = ip->i_cowfp;
+		ip->i_cowfp = tip->i_cowfp;
+		tip->i_cowfp = cowfp;
+		xfs_inode_set_cowblocks_tag(ip);
+		xfs_inode_set_cowblocks_tag(tip);
+	}
+
 	xfs_trans_log_inode(tp, ip,  src_log_flags);
 	xfs_trans_log_inode(tp, tip, target_log_flags);
 
@@ -1761,16 +2078,16 @@ xfs_swap_extents(
 
 	trace_xfs_swap_extent_after(ip, 0);
 	trace_xfs_swap_extent_after(tip, 1);
-out:
-	kmem_free(tempifp);
-	return error;
 
-out_unlock:
 	xfs_iunlock(ip, lock_flags);
 	xfs_iunlock(tip, lock_flags);
-	goto out;
+	return error;
 
 out_trans_cancel:
 	xfs_trans_cancel(tp);
-	goto out;
+
+out_unlock:
+	xfs_iunlock(ip, lock_flags);
+	xfs_iunlock(tip, lock_flags);
+	return error;
 }
diff --git a/fs/xfs/xfs_dir2_readdir.c b/fs/xfs/xfs_dir2_readdir.c
index f44f79996978..29816981b50a 100644
--- a/fs/xfs/xfs_dir2_readdir.c
+++ b/fs/xfs/xfs_dir2_readdir.c
@@ -84,7 +84,8 @@ xfs_dir2_sf_getdents(
 
 	sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
 
-	ASSERT(dp->i_d.di_size >= xfs_dir2_sf_hdr_size(sfp->i8count));
+	if (dp->i_d.di_size < xfs_dir2_sf_hdr_size(sfp->i8count))
+		return -EFSCORRUPTED;
 
 	/*
 	 * If the block number in the offset is out of range, we're done.
diff --git a/fs/xfs/xfs_error.h b/fs/xfs/xfs_error.h
index 3d224702fbc0..05f8666733a0 100644
--- a/fs/xfs/xfs_error.h
+++ b/fs/xfs/xfs_error.h
@@ -92,7 +92,11 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 #define XFS_ERRTAG_BMAPIFORMAT				21
 #define XFS_ERRTAG_FREE_EXTENT				22
 #define XFS_ERRTAG_RMAP_FINISH_ONE			23
-#define XFS_ERRTAG_MAX					24
+#define XFS_ERRTAG_REFCOUNT_CONTINUE_UPDATE		24
+#define XFS_ERRTAG_REFCOUNT_FINISH_ONE			25
+#define XFS_ERRTAG_BMAP_FINISH_ONE			26
+#define XFS_ERRTAG_AG_RESV_CRITICAL			27
+#define XFS_ERRTAG_MAX					28
 
 /*
  * Random factors for above tags, 1 means always, 2 means 1/2 time, etc.
@@ -121,6 +125,10 @@ extern void xfs_verifier_error(struct xfs_buf *bp);
 #define	XFS_RANDOM_BMAPIFORMAT				XFS_RANDOM_DEFAULT
 #define XFS_RANDOM_FREE_EXTENT				1
 #define XFS_RANDOM_RMAP_FINISH_ONE			1
+#define XFS_RANDOM_REFCOUNT_CONTINUE_UPDATE		1
+#define XFS_RANDOM_REFCOUNT_FINISH_ONE			1
+#define XFS_RANDOM_BMAP_FINISH_ONE			1
+#define XFS_RANDOM_AG_RESV_CRITICAL			4
 
 #ifdef DEBUG
 extern int xfs_error_test_active;
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 2bc58b3fd37d..a314fc7b56fa 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -38,6 +38,7 @@
 #include "xfs_icache.h"
 #include "xfs_pnfs.h"
 #include "xfs_iomap.h"
+#include "xfs_reflink.h"
 
 #include <linux/dcache.h>
 #include <linux/falloc.h>
@@ -634,6 +635,13 @@ xfs_file_dio_aio_write(
 
 	trace_xfs_file_direct_write(ip, count, iocb->ki_pos);
 
+	/* If this is a block-aligned directio CoW, remap immediately. */
+	if (xfs_is_reflink_inode(ip) && !unaligned_io) {
+		ret = xfs_reflink_allocate_cow_range(ip, iocb->ki_pos, count);
+		if (ret)
+			goto out;
+	}
+
 	data = *from;
 	ret = __blockdev_direct_IO(iocb, inode, target->bt_bdev, &data,
 			xfs_get_blocks_direct, xfs_end_io_direct_write,
@@ -735,6 +743,9 @@ write_retry:
 		enospc = xfs_inode_free_quota_eofblocks(ip);
 		if (enospc)
 			goto write_retry;
+		enospc = xfs_inode_free_quota_cowblocks(ip);
+		if (enospc)
+			goto write_retry;
 	} else if (ret == -ENOSPC && !enospc) {
 		struct xfs_eofblocks eofb = {0};
 
@@ -774,10 +785,20 @@ xfs_file_write_iter(
 
 	if (IS_DAX(inode))
 		ret = xfs_file_dax_write(iocb, from);
-	else if (iocb->ki_flags & IOCB_DIRECT)
+	else if (iocb->ki_flags & IOCB_DIRECT) {
+		/*
+		 * Allow a directio write to fall back to a buffered
+		 * write *only* in the case that we're doing a reflink
+		 * CoW.  In all other directio scenarios we do not
+		 * allow an operation to fall back to buffered mode.
+		 */
 		ret = xfs_file_dio_aio_write(iocb, from);
-	else
+		if (ret == -EREMCHG)
+			goto buffered;
+	} else {
+buffered:
 		ret = xfs_file_buffered_aio_write(iocb, from);
+	}
 
 	if (ret > 0) {
 		XFS_STATS_ADD(ip->i_mount, xs_write_bytes, ret);
@@ -791,7 +812,7 @@ xfs_file_write_iter(
 #define	XFS_FALLOC_FL_SUPPORTED						\
 		(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |		\
 		 FALLOC_FL_COLLAPSE_RANGE | FALLOC_FL_ZERO_RANGE |	\
-		 FALLOC_FL_INSERT_RANGE)
+		 FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE)
 
 STATIC long
 xfs_file_fallocate(
@@ -881,9 +902,15 @@ xfs_file_fallocate(
 
 		if (mode & FALLOC_FL_ZERO_RANGE)
 			error = xfs_zero_file_space(ip, offset, len);
-		else
+		else {
+			if (mode & FALLOC_FL_UNSHARE_RANGE) {
+				error = xfs_reflink_unshare(ip, offset, len);
+				if (error)
+					goto out_unlock;
+			}
 			error = xfs_alloc_file_space(ip, offset, len,
 						     XFS_BMAPI_PREALLOC);
+		}
 		if (error)
 			goto out_unlock;
 	}
@@ -920,6 +947,189 @@ out_unlock:
 	return error;
 }
 
+/*
+ * Flush all file writes out to disk.
+ */
+static int
+xfs_file_wait_for_io(
+	struct inode	*inode,
+	loff_t		offset,
+	size_t		len)
+{
+	loff_t		rounding;
+	loff_t		ioffset;
+	loff_t		iendoffset;
+	loff_t		bs;
+	int		ret;
+
+	bs = inode->i_sb->s_blocksize;
+	inode_dio_wait(inode);
+
+	rounding = max_t(xfs_off_t, bs, PAGE_SIZE);
+	ioffset = round_down(offset, rounding);
+	iendoffset = round_up(offset + len, rounding) - 1;
+	ret = filemap_write_and_wait_range(inode->i_mapping, ioffset,
+					   iendoffset);
+	return ret;
+}
+
+/* Hook up to the VFS reflink function */
+STATIC int
+xfs_file_share_range(
+	struct file	*file_in,
+	loff_t		pos_in,
+	struct file	*file_out,
+	loff_t		pos_out,
+	u64		len,
+	bool		is_dedupe)
+{
+	struct inode	*inode_in;
+	struct inode	*inode_out;
+	ssize_t		ret;
+	loff_t		bs;
+	loff_t		isize;
+	int		same_inode;
+	loff_t		blen;
+	unsigned int	flags = 0;
+
+	inode_in = file_inode(file_in);
+	inode_out = file_inode(file_out);
+	bs = inode_out->i_sb->s_blocksize;
+
+	/* Don't touch certain kinds of inodes */
+	if (IS_IMMUTABLE(inode_out))
+		return -EPERM;
+	if (IS_SWAPFILE(inode_in) ||
+	    IS_SWAPFILE(inode_out))
+		return -ETXTBSY;
+
+	/* Reflink only works within this filesystem. */
+	if (inode_in->i_sb != inode_out->i_sb)
+		return -EXDEV;
+	same_inode = (inode_in->i_ino == inode_out->i_ino);
+
+	/* Don't reflink dirs, pipes, sockets... */
+	if (S_ISDIR(inode_in->i_mode) || S_ISDIR(inode_out->i_mode))
+		return -EISDIR;
+	if (S_ISFIFO(inode_in->i_mode) || S_ISFIFO(inode_out->i_mode))
+		return -EINVAL;
+	if (!S_ISREG(inode_in->i_mode) || !S_ISREG(inode_out->i_mode))
+		return -EINVAL;
+
+	/* Don't share DAX file data for now. */
+	if (IS_DAX(inode_in) || IS_DAX(inode_out))
+		return -EINVAL;
+
+	/* Are we going all the way to the end? */
+	isize = i_size_read(inode_in);
+	if (isize == 0)
+		return 0;
+	if (len == 0)
+		len = isize - pos_in;
+
+	/* Ensure offsets don't wrap and the input is inside i_size */
+	if (pos_in + len < pos_in || pos_out + len < pos_out ||
+	    pos_in + len > isize)
+		return -EINVAL;
+
+	/* Don't allow dedupe past EOF in the dest file */
+	if (is_dedupe) {
+		loff_t	disize;
+
+		disize = i_size_read(inode_out);
+		if (pos_out >= disize || pos_out + len > disize)
+			return -EINVAL;
+	}
+
+	/* If we're linking to EOF, continue to the block boundary. */
+	if (pos_in + len == isize)
+		blen = ALIGN(isize, bs) - pos_in;
+	else
+		blen = len;
+
+	/* Only reflink if we're aligned to block boundaries */
+	if (!IS_ALIGNED(pos_in, bs) || !IS_ALIGNED(pos_in + blen, bs) ||
+	    !IS_ALIGNED(pos_out, bs) || !IS_ALIGNED(pos_out + blen, bs))
+		return -EINVAL;
+
+	/* Don't allow overlapped reflink within the same file */
+	if (same_inode && pos_out + blen > pos_in && pos_out < pos_in + blen)
+		return -EINVAL;
+
+	/* Wait for the completion of any pending IOs on srcfile */
+	ret = xfs_file_wait_for_io(inode_in, pos_in, len);
+	if (ret)
+		goto out;
+	ret = xfs_file_wait_for_io(inode_out, pos_out, len);
+	if (ret)
+		goto out;
+
+	if (is_dedupe)
+		flags |= XFS_REFLINK_DEDUPE;
+	ret = xfs_reflink_remap_range(XFS_I(inode_in), pos_in, XFS_I(inode_out),
+			pos_out, len, flags);
+	if (ret < 0)
+		goto out;
+
+out:
+	return ret;
+}
+
+STATIC ssize_t
+xfs_file_copy_range(
+	struct file	*file_in,
+	loff_t		pos_in,
+	struct file	*file_out,
+	loff_t		pos_out,
+	size_t		len,
+	unsigned int	flags)
+{
+	int		error;
+
+	error = xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+				     len, false);
+	if (error)
+		return error;
+	return len;
+}
+
+STATIC int
+xfs_file_clone_range(
+	struct file	*file_in,
+	loff_t		pos_in,
+	struct file	*file_out,
+	loff_t		pos_out,
+	u64		len)
+{
+	return xfs_file_share_range(file_in, pos_in, file_out, pos_out,
+				     len, false);
+}
+
+#define XFS_MAX_DEDUPE_LEN	(16 * 1024 * 1024)
+STATIC ssize_t
+xfs_file_dedupe_range(
+	struct file	*src_file,
+	u64		loff,
+	u64		len,
+	struct file	*dst_file,
+	u64		dst_loff)
+{
+	int		error;
+
+	/*
+	 * Limit the total length we will dedupe for each operation.
+	 * This is intended to bound the total time spent in this
+	 * ioctl to something sane.
+	 */
+	if (len > XFS_MAX_DEDUPE_LEN)
+		len = XFS_MAX_DEDUPE_LEN;
+
+	error = xfs_file_share_range(src_file, loff, dst_file, dst_loff,
+				     len, true);
+	if (error)
+		return error;
+	return len;
+}
 
 STATIC int
 xfs_file_open(
@@ -1581,6 +1791,9 @@ const struct file_operations xfs_file_operations = {
 	.fsync		= xfs_file_fsync,
 	.get_unmapped_area = thp_get_unmapped_area,
 	.fallocate	= xfs_file_fallocate,
+	.copy_file_range = xfs_file_copy_range,
+	.clone_file_range = xfs_file_clone_range,
+	.dedupe_file_range = xfs_file_dedupe_range,
 };
 
 const struct file_operations xfs_dir_file_operations = {
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index 94ac06f3d908..93d12fa2670d 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -43,6 +43,7 @@
 #include "xfs_log.h"
 #include "xfs_filestream.h"
 #include "xfs_rmap.h"
+#include "xfs_ag_resv.h"
 
 /*
  * File system operations
@@ -108,7 +109,9 @@ xfs_fs_geometry(
 			(xfs_sb_version_hassparseinodes(&mp->m_sb) ?
 				XFS_FSOP_GEOM_FLAGS_SPINODES : 0) |
 			(xfs_sb_version_hasrmapbt(&mp->m_sb) ?
-				XFS_FSOP_GEOM_FLAGS_RMAPBT : 0);
+				XFS_FSOP_GEOM_FLAGS_RMAPBT : 0) |
+			(xfs_sb_version_hasreflink(&mp->m_sb) ?
+				XFS_FSOP_GEOM_FLAGS_REFLINK : 0);
 		geo->logsectsize = xfs_sb_version_hassector(&mp->m_sb) ?
 				mp->m_sb.sb_logsectsize : BBSIZE;
 		geo->rtsectsize = mp->m_sb.sb_blocksize;
@@ -259,6 +262,12 @@ xfs_growfs_data_private(
 		agf->agf_longest = cpu_to_be32(tmpsize);
 		if (xfs_sb_version_hascrc(&mp->m_sb))
 			uuid_copy(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid);
+		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+			agf->agf_refcount_root = cpu_to_be32(
+					xfs_refc_block(mp));
+			agf->agf_refcount_level = cpu_to_be32(1);
+			agf->agf_refcount_blocks = cpu_to_be32(1);
+		}
 
 		error = xfs_bwrite(bp);
 		xfs_buf_relse(bp);
@@ -450,6 +459,17 @@ xfs_growfs_data_private(
 			rrec->rm_offset = 0;
 			be16_add_cpu(&block->bb_numrecs, 1);
 
+			/* account for refc btree root */
+			if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+				rrec = XFS_RMAP_REC_ADDR(block, 5);
+				rrec->rm_startblock = cpu_to_be32(
+						xfs_refc_block(mp));
+				rrec->rm_blockcount = cpu_to_be32(1);
+				rrec->rm_owner = cpu_to_be64(XFS_RMAP_OWN_REFC);
+				rrec->rm_offset = 0;
+				be16_add_cpu(&block->bb_numrecs, 1);
+			}
+
 			error = xfs_bwrite(bp);
 			xfs_buf_relse(bp);
 			if (error)
@@ -507,6 +527,28 @@ xfs_growfs_data_private(
 				goto error0;
 		}
 
+		/*
+		 * refcount btree root block
+		 */
+		if (xfs_sb_version_hasreflink(&mp->m_sb)) {
+			bp = xfs_growfs_get_hdr_buf(mp,
+				XFS_AGB_TO_DADDR(mp, agno, xfs_refc_block(mp)),
+				BTOBB(mp->m_sb.sb_blocksize), 0,
+				&xfs_refcountbt_buf_ops);
+			if (!bp) {
+				error = -ENOMEM;
+				goto error0;
+			}
+
+			xfs_btree_init_block(mp, bp, XFS_REFC_CRC_MAGIC,
+					     0, 0, agno,
+					     XFS_BTREE_CRC_BLOCKS);
+
+			error = xfs_bwrite(bp);
+			xfs_buf_relse(bp);
+			if (error)
+				goto error0;
+		}
 	}
 	xfs_trans_agblocks_delta(tp, nfree);
 	/*
@@ -589,6 +631,11 @@ xfs_growfs_data_private(
 	xfs_set_low_space_thresholds(mp);
 	mp->m_alloc_set_aside = xfs_alloc_set_aside(mp);
 
+	/* Reserve AG metadata blocks. */
+	error = xfs_fs_reserve_ag_blocks(mp);
+	if (error && error != -ENOSPC)
+		goto out;
+
 	/* update secondary superblocks. */
 	for (agno = 1; agno < nagcount; agno++) {
 		error = 0;
@@ -639,6 +686,8 @@ xfs_growfs_data_private(
 			continue;
 		}
 	}
+
+ out:
 	return saved_error ? saved_error : error;
 
  error0:
@@ -948,3 +997,59 @@ xfs_do_force_shutdown(
 	"Please umount the filesystem and rectify the problem(s)");
 	}
 }
+
+/*
+ * Reserve free space for per-AG metadata.
+ */
+int
+xfs_fs_reserve_ag_blocks(
+	struct xfs_mount	*mp)
+{
+	xfs_agnumber_t		agno;
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			err2;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+		err2 = xfs_ag_resv_init(pag);
+		xfs_perag_put(pag);
+		if (err2 && !error)
+			error = err2;
+	}
+
+	if (error && error != -ENOSPC) {
+		xfs_warn(mp,
+	"Error %d reserving per-AG metadata reserve pool.", error);
+		xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+	}
+
+	return error;
+}
+
+/*
+ * Free space reserved for per-AG metadata.
+ */
+int
+xfs_fs_unreserve_ag_blocks(
+	struct xfs_mount	*mp)
+{
+	xfs_agnumber_t		agno;
+	struct xfs_perag	*pag;
+	int			error = 0;
+	int			err2;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		pag = xfs_perag_get(mp, agno);
+		err2 = xfs_ag_resv_free(pag);
+		xfs_perag_put(pag);
+		if (err2 && !error)
+			error = err2;
+	}
+
+	if (error)
+		xfs_warn(mp,
+	"Error %d freeing per-AG metadata reserve pool.", error);
+
+	return error;
+}
diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h
index f32713f14f9a..f34915898fea 100644
--- a/fs/xfs/xfs_fsops.h
+++ b/fs/xfs/xfs_fsops.h
@@ -26,4 +26,7 @@ extern int xfs_reserve_blocks(xfs_mount_t *mp, __uint64_t *inval,
 				xfs_fsop_resblks_t *outval);
 extern int xfs_fs_goingdown(xfs_mount_t *mp, __uint32_t inflags);
 
+extern int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp);
+extern int xfs_fs_unreserve_ag_blocks(struct xfs_mount *mp);
+
 #endif	/* __XFS_FSOPS_H__ */
diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c
index 4d41b241298f..687a4b01fc53 100644
--- a/fs/xfs/xfs_globals.c
+++ b/fs/xfs/xfs_globals.c
@@ -21,8 +21,8 @@
 /*
  * Tunable XFS parameters.  xfs_params is required even when CONFIG_SYSCTL=n,
  * other XFS code uses these values.  Times are measured in centisecs (i.e.
- * 100ths of a second) with the exception of eofb_timer, which is measured in
- * seconds.
+ * 100ths of a second) with the exception of eofb_timer and cowb_timer, which
+ * are measured in seconds.
  */
 xfs_param_t xfs_params = {
 			  /*	MIN		DFLT		MAX	*/
@@ -42,6 +42,7 @@ xfs_param_t xfs_params = {
 	.inherit_nodfrg	= {	0,		1,		1	},
 	.fstrm_timer	= {	1,		30*100,		3600*100},
 	.eofb_timer	= {	1,		300,		3600*24},
+	.cowb_timer	= {	1,		1800,		3600*24},
 };
 
 struct xfs_globals xfs_globals = {
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index 65b2e3f85f52..14796b744e0a 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -33,6 +33,7 @@
 #include "xfs_bmap_util.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
+#include "xfs_reflink.h"
 
 #include <linux/kthread.h>
 #include <linux/freezer.h>
@@ -76,6 +77,9 @@ xfs_inode_alloc(
 	ip->i_mount = mp;
 	memset(&ip->i_imap, 0, sizeof(struct xfs_imap));
 	ip->i_afp = NULL;
+	ip->i_cowfp = NULL;
+	ip->i_cnextents = 0;
+	ip->i_cformat = XFS_DINODE_FMT_EXTENTS;
 	memset(&ip->i_df, 0, sizeof(xfs_ifork_t));
 	ip->i_flags = 0;
 	ip->i_delayed_blks = 0;
@@ -101,6 +105,8 @@ xfs_inode_free_callback(
 
 	if (ip->i_afp)
 		xfs_idestroy_fork(ip, XFS_ATTR_FORK);
+	if (ip->i_cowfp)
+		xfs_idestroy_fork(ip, XFS_COW_FORK);
 
 	if (ip->i_itemp) {
 		ASSERT(!(ip->i_itemp->ili_item.li_flags & XFS_LI_IN_AIL));
@@ -787,6 +793,33 @@ xfs_eofblocks_worker(
 	xfs_queue_eofblocks(mp);
 }
 
+/*
+ * Background scanning to trim preallocated CoW space. This is queued
+ * based on the 'speculative_cow_prealloc_lifetime' tunable (5m by default).
+ * (We'll just piggyback on the post-EOF prealloc space workqueue.)
+ */
+STATIC void
+xfs_queue_cowblocks(
+	struct xfs_mount *mp)
+{
+	rcu_read_lock();
+	if (radix_tree_tagged(&mp->m_perag_tree, XFS_ICI_COWBLOCKS_TAG))
+		queue_delayed_work(mp->m_eofblocks_workqueue,
+				   &mp->m_cowblocks_work,
+				   msecs_to_jiffies(xfs_cowb_secs * 1000));
+	rcu_read_unlock();
+}
+
+void
+xfs_cowblocks_worker(
+	struct work_struct *work)
+{
+	struct xfs_mount *mp = container_of(to_delayed_work(work),
+				struct xfs_mount, m_cowblocks_work);
+	xfs_icache_free_cowblocks(mp, NULL);
+	xfs_queue_cowblocks(mp);
+}
+
 int
 xfs_inode_ag_iterator(
 	struct xfs_mount	*mp,
@@ -1343,18 +1376,30 @@ xfs_inode_free_eofblocks(
 	return ret;
 }
 
-int
-xfs_icache_free_eofblocks(
+static int
+__xfs_icache_free_eofblocks(
 	struct xfs_mount	*mp,
-	struct xfs_eofblocks	*eofb)
+	struct xfs_eofblocks	*eofb,
+	int			(*execute)(struct xfs_inode *ip, int flags,
+					   void *args),
+	int			tag)
 {
 	int flags = SYNC_TRYLOCK;
 
 	if (eofb && (eofb->eof_flags & XFS_EOF_FLAGS_SYNC))
 		flags = SYNC_WAIT;
 
-	return xfs_inode_ag_iterator_tag(mp, xfs_inode_free_eofblocks, flags,
-					 eofb, XFS_ICI_EOFBLOCKS_TAG);
+	return xfs_inode_ag_iterator_tag(mp, execute, flags,
+					 eofb, tag);
+}
+
+int
+xfs_icache_free_eofblocks(
+	struct xfs_mount	*mp,
+	struct xfs_eofblocks	*eofb)
+{
+	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_eofblocks,
+			XFS_ICI_EOFBLOCKS_TAG);
 }
 
 /*
@@ -1363,9 +1408,11 @@ xfs_icache_free_eofblocks(
  * failure. We make a best effort by including each quota under low free space
  * conditions (less than 1% free space) in the scan.
  */
-int
-xfs_inode_free_quota_eofblocks(
-	struct xfs_inode *ip)
+static int
+__xfs_inode_free_quota_eofblocks(
+	struct xfs_inode	*ip,
+	int			(*execute)(struct xfs_mount *mp,
+					   struct xfs_eofblocks	*eofb))
 {
 	int scan = 0;
 	struct xfs_eofblocks eofb = {0};
@@ -1401,14 +1448,25 @@ xfs_inode_free_quota_eofblocks(
 	}
 
 	if (scan)
-		xfs_icache_free_eofblocks(ip->i_mount, &eofb);
+		execute(ip->i_mount, &eofb);
 
 	return scan;
 }
 
-void
-xfs_inode_set_eofblocks_tag(
-	xfs_inode_t	*ip)
+int
+xfs_inode_free_quota_eofblocks(
+	struct xfs_inode *ip)
+{
+	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_eofblocks);
+}
+
+static void
+__xfs_inode_set_eofblocks_tag(
+	xfs_inode_t	*ip,
+	void		(*execute)(struct xfs_mount *mp),
+	void		(*set_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+				  int error, unsigned long caller_ip),
+	int		tag)
 {
 	struct xfs_mount *mp = ip->i_mount;
 	struct xfs_perag *pag;
@@ -1426,26 +1484,22 @@ xfs_inode_set_eofblocks_tag(
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
-	trace_xfs_inode_set_eofblocks_tag(ip);
 
-	tagged = radix_tree_tagged(&pag->pag_ici_root,
-				   XFS_ICI_EOFBLOCKS_TAG);
+	tagged = radix_tree_tagged(&pag->pag_ici_root, tag);
 	radix_tree_tag_set(&pag->pag_ici_root,
-			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-			   XFS_ICI_EOFBLOCKS_TAG);
+			   XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
 	if (!tagged) {
 		/* propagate the eofblocks tag up into the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
 		radix_tree_tag_set(&ip->i_mount->m_perag_tree,
 				   XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				   XFS_ICI_EOFBLOCKS_TAG);
+				   tag);
 		spin_unlock(&ip->i_mount->m_perag_lock);
 
 		/* kick off background trimming */
-		xfs_queue_eofblocks(ip->i_mount);
+		execute(ip->i_mount);
 
-		trace_xfs_perag_set_eofblocks(ip->i_mount, pag->pag_agno,
-					      -1, _RET_IP_);
+		set_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
 	}
 
 	spin_unlock(&pag->pag_ici_lock);
@@ -1453,9 +1507,22 @@ xfs_inode_set_eofblocks_tag(
 }
 
 void
-xfs_inode_clear_eofblocks_tag(
+xfs_inode_set_eofblocks_tag(
 	xfs_inode_t	*ip)
 {
+	trace_xfs_inode_set_eofblocks_tag(ip);
+	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_eofblocks,
+			trace_xfs_perag_set_eofblocks,
+			XFS_ICI_EOFBLOCKS_TAG);
+}
+
+static void
+__xfs_inode_clear_eofblocks_tag(
+	xfs_inode_t	*ip,
+	void		(*clear_tp)(struct xfs_mount *mp, xfs_agnumber_t agno,
+				    int error, unsigned long caller_ip),
+	int		tag)
+{
 	struct xfs_mount *mp = ip->i_mount;
 	struct xfs_perag *pag;
 
@@ -1465,23 +1532,141 @@ xfs_inode_clear_eofblocks_tag(
 
 	pag = xfs_perag_get(mp, XFS_INO_TO_AGNO(mp, ip->i_ino));
 	spin_lock(&pag->pag_ici_lock);
-	trace_xfs_inode_clear_eofblocks_tag(ip);
 
 	radix_tree_tag_clear(&pag->pag_ici_root,
-			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino),
-			     XFS_ICI_EOFBLOCKS_TAG);
-	if (!radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_EOFBLOCKS_TAG)) {
+			     XFS_INO_TO_AGINO(ip->i_mount, ip->i_ino), tag);
+	if (!radix_tree_tagged(&pag->pag_ici_root, tag)) {
 		/* clear the eofblocks tag from the perag radix tree */
 		spin_lock(&ip->i_mount->m_perag_lock);
 		radix_tree_tag_clear(&ip->i_mount->m_perag_tree,
 				     XFS_INO_TO_AGNO(ip->i_mount, ip->i_ino),
-				     XFS_ICI_EOFBLOCKS_TAG);
+				     tag);
 		spin_unlock(&ip->i_mount->m_perag_lock);
-		trace_xfs_perag_clear_eofblocks(ip->i_mount, pag->pag_agno,
-					       -1, _RET_IP_);
+		clear_tp(ip->i_mount, pag->pag_agno, -1, _RET_IP_);
 	}
 
 	spin_unlock(&pag->pag_ici_lock);
 	xfs_perag_put(pag);
 }
 
+void
+xfs_inode_clear_eofblocks_tag(
+	xfs_inode_t	*ip)
+{
+	trace_xfs_inode_clear_eofblocks_tag(ip);
+	return __xfs_inode_clear_eofblocks_tag(ip,
+			trace_xfs_perag_clear_eofblocks, XFS_ICI_EOFBLOCKS_TAG);
+}
+
+/*
+ * Automatic CoW Reservation Freeing
+ *
+ * These functions automatically garbage collect leftover CoW reservations
+ * that were made on behalf of a cowextsize hint when we start to run out
+ * of quota or when the reservations sit around for too long.  If the file
+ * has dirty pages or is undergoing writeback, its CoW reservations will
+ * be retained.
+ *
+ * The actual garbage collection piggybacks off the same code that runs
+ * the speculative EOF preallocation garbage collector.
+ */
+STATIC int
+xfs_inode_free_cowblocks(
+	struct xfs_inode	*ip,
+	int			flags,
+	void			*args)
+{
+	int ret;
+	struct xfs_eofblocks *eofb = args;
+	bool need_iolock = true;
+	int match;
+
+	ASSERT(!eofb || (eofb && eofb->eof_scan_owner != 0));
+
+	if (!xfs_reflink_has_real_cow_blocks(ip)) {
+		trace_xfs_inode_free_cowblocks_invalid(ip);
+		xfs_inode_clear_cowblocks_tag(ip);
+		return 0;
+	}
+
+	/*
+	 * If the mapping is dirty or under writeback we cannot touch the
+	 * CoW fork.  Leave it alone if we're in the midst of a directio.
+	 */
+	if (mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_DIRTY) ||
+	    mapping_tagged(VFS_I(ip)->i_mapping, PAGECACHE_TAG_WRITEBACK) ||
+	    atomic_read(&VFS_I(ip)->i_dio_count))
+		return 0;
+
+	if (eofb) {
+		if (eofb->eof_flags & XFS_EOF_FLAGS_UNION)
+			match = xfs_inode_match_id_union(ip, eofb);
+		else
+			match = xfs_inode_match_id(ip, eofb);
+		if (!match)
+			return 0;
+
+		/* skip the inode if the file size is too small */
+		if (eofb->eof_flags & XFS_EOF_FLAGS_MINFILESIZE &&
+		    XFS_ISIZE(ip) < eofb->eof_min_file_size)
+			return 0;
+
+		/*
+		 * A scan owner implies we already hold the iolock. Skip it in
+		 * xfs_free_eofblocks() to avoid deadlock. This also eliminates
+		 * the possibility of EAGAIN being returned.
+		 */
+		if (eofb->eof_scan_owner == ip->i_ino)
+			need_iolock = false;
+	}
+
+	/* Free the CoW blocks */
+	if (need_iolock) {
+		xfs_ilock(ip, XFS_IOLOCK_EXCL);
+		xfs_ilock(ip, XFS_MMAPLOCK_EXCL);
+	}
+
+	ret = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+
+	if (need_iolock) {
+		xfs_iunlock(ip, XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(ip, XFS_IOLOCK_EXCL);
+	}
+
+	return ret;
+}
+
+int
+xfs_icache_free_cowblocks(
+	struct xfs_mount	*mp,
+	struct xfs_eofblocks	*eofb)
+{
+	return __xfs_icache_free_eofblocks(mp, eofb, xfs_inode_free_cowblocks,
+			XFS_ICI_COWBLOCKS_TAG);
+}
+
+int
+xfs_inode_free_quota_cowblocks(
+	struct xfs_inode *ip)
+{
+	return __xfs_inode_free_quota_eofblocks(ip, xfs_icache_free_cowblocks);
+}
+
+void
+xfs_inode_set_cowblocks_tag(
+	xfs_inode_t	*ip)
+{
+	trace_xfs_inode_set_eofblocks_tag(ip);
+	return __xfs_inode_set_eofblocks_tag(ip, xfs_queue_cowblocks,
+			trace_xfs_perag_set_eofblocks,
+			XFS_ICI_COWBLOCKS_TAG);
+}
+
+void
+xfs_inode_clear_cowblocks_tag(
+	xfs_inode_t	*ip)
+{
+	trace_xfs_inode_clear_eofblocks_tag(ip);
+	return __xfs_inode_clear_eofblocks_tag(ip,
+			trace_xfs_perag_clear_eofblocks, XFS_ICI_COWBLOCKS_TAG);
+}
diff --git a/fs/xfs/xfs_icache.h b/fs/xfs/xfs_icache.h
index 05bac99bef75..a1e02f4708ab 100644
--- a/fs/xfs/xfs_icache.h
+++ b/fs/xfs/xfs_icache.h
@@ -40,6 +40,7 @@ struct xfs_eofblocks {
 					   in xfs_inode_ag_iterator */
 #define XFS_ICI_RECLAIM_TAG	0	/* inode is to be reclaimed */
 #define XFS_ICI_EOFBLOCKS_TAG	1	/* inode has blocks beyond EOF */
+#define XFS_ICI_COWBLOCKS_TAG	2	/* inode can have cow blocks to gc */
 
 /*
  * Flags for xfs_iget()
@@ -70,6 +71,12 @@ int xfs_inode_free_quota_eofblocks(struct xfs_inode *ip);
 void xfs_eofblocks_worker(struct work_struct *);
 void xfs_queue_eofblocks(struct xfs_mount *);
 
+void xfs_inode_set_cowblocks_tag(struct xfs_inode *ip);
+void xfs_inode_clear_cowblocks_tag(struct xfs_inode *ip);
+int xfs_icache_free_cowblocks(struct xfs_mount *, struct xfs_eofblocks *);
+int xfs_inode_free_quota_cowblocks(struct xfs_inode *ip);
+void xfs_cowblocks_worker(struct work_struct *);
+
 int xfs_inode_ag_iterator(struct xfs_mount *mp,
 	int (*execute)(struct xfs_inode *ip, int flags, void *args),
 	int flags, void *args);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 624e1dfa716b..4e560e6a12c1 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -49,6 +49,7 @@
 #include "xfs_trans_priv.h"
 #include "xfs_log.h"
 #include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
 
 kmem_zone_t *xfs_inode_zone;
 
@@ -77,6 +78,29 @@ xfs_get_extsz_hint(
 }
 
 /*
+ * Helper function to extract CoW extent size hint from inode.
+ * Between the extent size hint and the CoW extent size hint, we
+ * return the greater of the two.  If the value is zero (automatic),
+ * use the default size.
+ */
+xfs_extlen_t
+xfs_get_cowextsz_hint(
+	struct xfs_inode	*ip)
+{
+	xfs_extlen_t		a, b;
+
+	a = 0;
+	if (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+		a = ip->i_d.di_cowextsize;
+	b = xfs_get_extsz_hint(ip);
+
+	a = max(a, b);
+	if (a == 0)
+		return XFS_DEFAULT_COWEXTSZ_HINT;
+	return a;
+}
+
+/*
  * These two are wrapper routines around the xfs_ilock() routine used to
  * centralize some grungy code.  They are used in places that wish to lock the
  * inode solely for reading the extents.  The reason these places can't just
@@ -651,6 +675,8 @@ _xfs_dic2xflags(
 	if (di_flags2 & XFS_DIFLAG2_ANY) {
 		if (di_flags2 & XFS_DIFLAG2_DAX)
 			flags |= FS_XFLAG_DAX;
+		if (di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+			flags |= FS_XFLAG_COWEXTSIZE;
 	}
 
 	if (has_attr)
@@ -834,6 +860,7 @@ xfs_ialloc(
 	if (ip->i_d.di_version == 3) {
 		inode->i_version = 1;
 		ip->i_d.di_flags2 = 0;
+		ip->i_d.di_cowextsize = 0;
 		ip->i_d.di_crtime.t_sec = (__int32_t)tv.tv_sec;
 		ip->i_d.di_crtime.t_nsec = (__int32_t)tv.tv_nsec;
 	}
@@ -896,6 +923,15 @@ xfs_ialloc(
 			ip->i_d.di_flags |= di_flags;
 			ip->i_d.di_flags2 |= di_flags2;
 		}
+		if (pip &&
+		    (pip->i_d.di_flags2 & XFS_DIFLAG2_ANY) &&
+		    pip->i_d.di_version == 3 &&
+		    ip->i_d.di_version == 3) {
+			if (pip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) {
+				ip->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+				ip->i_d.di_cowextsize = pip->i_d.di_cowextsize;
+			}
+		}
 		/* FALLTHROUGH */
 	case S_IFLNK:
 		ip->i_d.di_format = XFS_DINODE_FMT_EXTENTS;
@@ -1586,6 +1622,20 @@ xfs_itruncate_extents(
 			goto out;
 	}
 
+	/* Remove all pending CoW reservations. */
+	error = xfs_reflink_cancel_cow_blocks(ip, &tp, first_unmap_block,
+			last_block);
+	if (error)
+		goto out;
+
+	/*
+	 * Clear the reflink flag if we truncated everything.
+	 */
+	if (ip->i_d.di_nblocks == 0 && xfs_is_reflink_inode(ip)) {
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+		xfs_inode_clear_cowblocks_tag(ip);
+	}
+
 	/*
 	 * Always re-log the inode so that our permanent transaction can keep
 	 * on rolling it forward in the log.
@@ -1850,6 +1900,7 @@ xfs_inactive(
 	}
 
 	mp = ip->i_mount;
+	ASSERT(!xfs_iflags_test(ip, XFS_IRECOVERY));
 
 	/* If this is a read-only mount, don't do this (would generate I/O) */
 	if (mp->m_flags & XFS_MOUNT_RDONLY)
diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h
index 8f30d2533b48..f14c1de2549d 100644
--- a/fs/xfs/xfs_inode.h
+++ b/fs/xfs/xfs_inode.h
@@ -47,6 +47,7 @@ typedef struct xfs_inode {
 
 	/* Extent information. */
 	xfs_ifork_t		*i_afp;		/* attribute fork pointer */
+	xfs_ifork_t		*i_cowfp;	/* copy on write extents */
 	xfs_ifork_t		i_df;		/* data fork */
 
 	/* operations vectors */
@@ -65,6 +66,9 @@ typedef struct xfs_inode {
 
 	struct xfs_icdinode	i_d;		/* most of ondisk inode */
 
+	xfs_extnum_t		i_cnextents;	/* # of extents in cow fork */
+	unsigned int		i_cformat;	/* format of cow fork */
+
 	/* VFS inode */
 	struct inode		i_vnode;	/* embedded VFS inode */
 } xfs_inode_t;
@@ -202,6 +206,11 @@ xfs_get_initial_prid(struct xfs_inode *dp)
 	return XFS_PROJID_DEFAULT;
 }
 
+static inline bool xfs_is_reflink_inode(struct xfs_inode *ip)
+{
+	return ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK;
+}
+
 /*
  * In-core inode flags.
  */
@@ -217,6 +226,12 @@ xfs_get_initial_prid(struct xfs_inode *dp)
 #define XFS_IPINNED		(1 << __XFS_IPINNED_BIT)
 #define XFS_IDONTCACHE		(1 << 9) /* don't cache the inode long term */
 #define XFS_IEOFBLOCKS		(1 << 10)/* has the preallocblocks tag set */
+/*
+ * If this unlinked inode is in the middle of recovery, don't let drop_inode
+ * truncate and free the inode.  This can happen if we iget the inode during
+ * log recovery to replay a bmap operation on the inode.
+ */
+#define XFS_IRECOVERY		(1 << 11)
 
 /*
  * Per-lifetime flags need to be reset when re-using a reclaimable inode during
@@ -411,6 +426,7 @@ int		xfs_iflush(struct xfs_inode *, struct xfs_buf **);
 void		xfs_lock_two_inodes(xfs_inode_t *, xfs_inode_t *, uint);
 
 xfs_extlen_t	xfs_get_extsz_hint(struct xfs_inode *ip);
+xfs_extlen_t	xfs_get_cowextsz_hint(struct xfs_inode *ip);
 
 int		xfs_dir_ialloc(struct xfs_trans **, struct xfs_inode *, umode_t,
 			       xfs_nlink_t, xfs_dev_t, prid_t, int,
@@ -474,4 +490,7 @@ do { \
 
 extern struct kmem_zone	*xfs_inode_zone;
 
+/* The default CoW extent size hint. */
+#define XFS_DEFAULT_COWEXTSZ_HINT 32
+
 #endif	/* __XFS_INODE_H__ */
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 892c2aced207..9610e9c00952 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -368,7 +368,7 @@ xfs_inode_to_log_dinode(
 		to->di_crtime.t_sec = from->di_crtime.t_sec;
 		to->di_crtime.t_nsec = from->di_crtime.t_nsec;
 		to->di_flags2 = from->di_flags2;
-
+		to->di_cowextsize = from->di_cowextsize;
 		to->di_ino = ip->i_ino;
 		to->di_lsn = lsn;
 		memset(to->di_pad2, 0, sizeof(to->di_pad2));
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 0d9021f0551e..c245bed3249b 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -903,6 +903,8 @@ xfs_ioc_fsgetxattr(
 	xfs_ilock(ip, XFS_ILOCK_SHARED);
 	fa.fsx_xflags = xfs_ip2xflags(ip);
 	fa.fsx_extsize = ip->i_d.di_extsize << ip->i_mount->m_sb.sb_blocklog;
+	fa.fsx_cowextsize = ip->i_d.di_cowextsize <<
+			ip->i_mount->m_sb.sb_blocklog;
 	fa.fsx_projid = xfs_get_projid(ip);
 
 	if (attr) {
@@ -973,12 +975,13 @@ xfs_set_diflags(
 	if (ip->i_d.di_version < 3)
 		return;
 
-	di_flags2 = 0;
+	di_flags2 = (ip->i_d.di_flags2 & XFS_DIFLAG2_REFLINK);
 	if (xflags & FS_XFLAG_DAX)
 		di_flags2 |= XFS_DIFLAG2_DAX;
+	if (xflags & FS_XFLAG_COWEXTSIZE)
+		di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
 
 	ip->i_d.di_flags2 = di_flags2;
-
 }
 
 STATIC void
@@ -1031,6 +1034,14 @@ xfs_ioctl_setattr_xflags(
 			return -EINVAL;
 	}
 
+	/* Clear reflink if we are actually able to set the rt flag. */
+	if ((fa->fsx_xflags & FS_XFLAG_REALTIME) && xfs_is_reflink_inode(ip))
+		ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+
+	/* Don't allow us to set DAX mode for a reflinked file for now. */
+	if ((fa->fsx_xflags & FS_XFLAG_DAX) && xfs_is_reflink_inode(ip))
+		return -EINVAL;
+
 	/*
 	 * Can't modify an immutable/append-only file unless
 	 * we have appropriate permission.
@@ -1219,6 +1230,56 @@ xfs_ioctl_setattr_check_extsize(
 	return 0;
 }
 
+/*
+ * CoW extent size hint validation rules are:
+ *
+ * 1. CoW extent size hint can only be set if reflink is enabled on the fs.
+ *    The inode does not have to have any shared blocks, but it must be a v3.
+ * 2. FS_XFLAG_COWEXTSIZE is only valid for directories and regular files;
+ *    for a directory, the hint is propagated to new files.
+ * 3. Can be changed on files & directories at any time.
+ * 4. CoW extsize hint of 0 turns off hints, clears inode flags.
+ * 5. Extent size must be a multiple of the appropriate block size.
+ * 6. The extent size hint must be limited to half the AG size to avoid
+ *    alignment extending the extent beyond the limits of the AG.
+ */
+static int
+xfs_ioctl_setattr_check_cowextsize(
+	struct xfs_inode	*ip,
+	struct fsxattr		*fa)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+
+	if (!(fa->fsx_xflags & FS_XFLAG_COWEXTSIZE))
+		return 0;
+
+	if (!xfs_sb_version_hasreflink(&ip->i_mount->m_sb) ||
+	    ip->i_d.di_version != 3)
+		return -EINVAL;
+
+	if (!S_ISREG(VFS_I(ip)->i_mode) && !S_ISDIR(VFS_I(ip)->i_mode))
+		return -EINVAL;
+
+	if (fa->fsx_cowextsize != 0) {
+		xfs_extlen_t    size;
+		xfs_fsblock_t   cowextsize_fsb;
+
+		cowextsize_fsb = XFS_B_TO_FSB(mp, fa->fsx_cowextsize);
+		if (cowextsize_fsb > MAXEXTLEN)
+			return -EINVAL;
+
+		size = mp->m_sb.sb_blocksize;
+		if (cowextsize_fsb > mp->m_sb.sb_agblocks / 2)
+			return -EINVAL;
+
+		if (fa->fsx_cowextsize % size)
+			return -EINVAL;
+	} else
+		fa->fsx_xflags &= ~FS_XFLAG_COWEXTSIZE;
+
+	return 0;
+}
+
 static int
 xfs_ioctl_setattr_check_projid(
 	struct xfs_inode	*ip,
@@ -1311,6 +1372,10 @@ xfs_ioctl_setattr(
 	if (code)
 		goto error_trans_cancel;
 
+	code = xfs_ioctl_setattr_check_cowextsize(ip, fa);
+	if (code)
+		goto error_trans_cancel;
+
 	code = xfs_ioctl_setattr_xflags(tp, ip, fa);
 	if (code)
 		goto error_trans_cancel;
@@ -1346,6 +1411,12 @@ xfs_ioctl_setattr(
 		ip->i_d.di_extsize = fa->fsx_extsize >> mp->m_sb.sb_blocklog;
 	else
 		ip->i_d.di_extsize = 0;
+	if (ip->i_d.di_version == 3 &&
+	    (ip->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+		ip->i_d.di_cowextsize = fa->fsx_cowextsize >>
+				mp->m_sb.sb_blocklog;
+	else
+		ip->i_d.di_cowextsize = 0;
 
 	code = xfs_trans_commit(tp);
 
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index c08253e11545..d907eb9f8ef3 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -39,6 +39,7 @@
 #include "xfs_quota.h"
 #include "xfs_dquot_item.h"
 #include "xfs_dquot.h"
+#include "xfs_reflink.h"
 
 
 #define XFS_WRITEIO_ALIGN(mp,off)	(((off) >> mp->m_writeio_log) \
@@ -70,7 +71,7 @@ xfs_bmbt_to_iomap(
 	iomap->bdev = xfs_find_bdev_for_inode(VFS_I(ip));
 }
 
-static xfs_extlen_t
+xfs_extlen_t
 xfs_eof_alignment(
 	struct xfs_inode	*ip,
 	xfs_extlen_t		extsize)
@@ -609,7 +610,7 @@ xfs_file_iomap_begin_delay(
 	}
 
 retry:
-	error = xfs_bmapi_reserve_delalloc(ip, offset_fsb,
+	error = xfs_bmapi_reserve_delalloc(ip, XFS_DATA_FORK, offset_fsb,
 			end_fsb - offset_fsb, &got,
 			&prev, &idx, eof);
 	switch (error) {
@@ -666,6 +667,7 @@ out_unlock:
 int
 xfs_iomap_write_allocate(
 	xfs_inode_t	*ip,
+	int		whichfork,
 	xfs_off_t	offset,
 	xfs_bmbt_irec_t *imap)
 {
@@ -678,8 +680,12 @@ xfs_iomap_write_allocate(
 	xfs_trans_t	*tp;
 	int		nimaps;
 	int		error = 0;
+	int		flags = 0;
 	int		nres;
 
+	if (whichfork == XFS_COW_FORK)
+		flags |= XFS_BMAPI_COWFORK;
+
 	/*
 	 * Make sure that the dquots are there.
 	 */
@@ -773,7 +779,7 @@ xfs_iomap_write_allocate(
 			 * pointer that the caller gave to us.
 			 */
 			error = xfs_bmapi_write(tp, ip, map_start_fsb,
-						count_fsb, 0, &first_block,
+						count_fsb, flags, &first_block,
 						nres, imap, &nimaps,
 						&dfops);
 			if (error)
@@ -955,14 +961,22 @@ xfs_file_iomap_begin(
 	struct xfs_mount	*mp = ip->i_mount;
 	struct xfs_bmbt_irec	imap;
 	xfs_fileoff_t		offset_fsb, end_fsb;
+	bool			shared, trimmed;
 	int			nimaps = 1, error = 0;
 	unsigned		lockmode;
 
 	if (XFS_FORCED_SHUTDOWN(mp))
 		return -EIO;
 
-	if ((flags & IOMAP_WRITE) &&
-	    !IS_DAX(inode) && !xfs_get_extsz_hint(ip)) {
+	if ((flags & (IOMAP_WRITE | IOMAP_ZERO)) && xfs_is_reflink_inode(ip)) {
+		error = xfs_reflink_reserve_cow_range(ip, offset, length);
+		if (error < 0)
+			return error;
+	}
+
+	if ((flags & IOMAP_WRITE) && !IS_DAX(inode) &&
+		   !xfs_get_extsz_hint(ip)) {
+		/* Reserve delalloc blocks for regular writeback. */
 		return xfs_file_iomap_begin_delay(inode, offset, length, flags,
 				iomap);
 	}
@@ -976,7 +990,14 @@ xfs_file_iomap_begin(
 	end_fsb = XFS_B_TO_FSB(mp, offset + length);
 
 	error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb, &imap,
-			       &nimaps, XFS_BMAPI_ENTIRE);
+			       &nimaps, 0);
+	if (error) {
+		xfs_iunlock(ip, lockmode);
+		return error;
+	}
+
+	/* Trim the mapping to the nearest shared extent boundary. */
+	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
 	if (error) {
 		xfs_iunlock(ip, lockmode);
 		return error;
@@ -1015,6 +1036,8 @@ xfs_file_iomap_begin(
 	}
 
 	xfs_bmbt_to_iomap(ip, iomap, &imap);
+	if (shared)
+		iomap->flags |= IOMAP_F_SHARED;
 	return 0;
 }
 
diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h
index 6498be485932..6d45cf01fcff 100644
--- a/fs/xfs/xfs_iomap.h
+++ b/fs/xfs/xfs_iomap.h
@@ -25,12 +25,13 @@ struct xfs_bmbt_irec;
 
 int xfs_iomap_write_direct(struct xfs_inode *, xfs_off_t, size_t,
 			struct xfs_bmbt_irec *, int);
-int xfs_iomap_write_allocate(struct xfs_inode *, xfs_off_t,
+int xfs_iomap_write_allocate(struct xfs_inode *, int, xfs_off_t,
 			struct xfs_bmbt_irec *);
 int xfs_iomap_write_unwritten(struct xfs_inode *, xfs_off_t, xfs_off_t);
 
 void xfs_bmbt_to_iomap(struct xfs_inode *, struct iomap *,
 		struct xfs_bmbt_irec *);
+xfs_extlen_t xfs_eof_alignment(struct xfs_inode *ip, xfs_extlen_t extsize);
 
 extern struct iomap_ops xfs_iomap_ops;
 extern struct iomap_ops xfs_xattr_iomap_ops;
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index c5da95eb79b8..405a65cd9d6b 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1159,6 +1159,7 @@ xfs_diflags_to_iflags(
 		inode->i_flags |= S_NOATIME;
 	if (S_ISREG(inode->i_mode) &&
 	    ip->i_mount->m_sb.sb_blocksize == PAGE_SIZE &&
+	    !xfs_is_reflink_inode(ip) &&
 	    (ip->i_mount->m_flags & XFS_MOUNT_DAX ||
 	     ip->i_d.di_flags2 & XFS_DIFLAG2_DAX))
 		inode->i_flags |= S_DAX;
diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c
index ce73eb34620d..66e881790c17 100644
--- a/fs/xfs/xfs_itable.c
+++ b/fs/xfs/xfs_itable.c
@@ -66,7 +66,7 @@ xfs_bulkstat_one_int(
 	if (!buffer || xfs_internal_inum(mp, ino))
 		return -EINVAL;
 
-	buf = kmem_alloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
+	buf = kmem_zalloc(sizeof(*buf), KM_SLEEP | KM_MAYFAIL);
 	if (!buf)
 		return -ENOMEM;
 
@@ -111,6 +111,12 @@ xfs_bulkstat_one_int(
 	buf->bs_aextents = dic->di_anextents;
 	buf->bs_forkoff = XFS_IFORK_BOFF(ip);
 
+	if (dic->di_version == 3) {
+		if (dic->di_flags2 & XFS_DIFLAG2_COWEXTSIZE)
+			buf->bs_cowextsize = dic->di_cowextsize <<
+					mp->m_sb.sb_blocklog;
+	}
+
 	switch (dic->di_format) {
 	case XFS_DINODE_FMT_DEV:
 		buf->bs_rdev = ip->i_df.if_u2.if_rdev;
diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h
index b8d64d520e12..68640fb63a54 100644
--- a/fs/xfs/xfs_linux.h
+++ b/fs/xfs/xfs_linux.h
@@ -116,6 +116,7 @@ typedef __u32			xfs_nlink_t;
 #define xfs_inherit_nodefrag	xfs_params.inherit_nodfrg.val
 #define xfs_fstrm_centisecs	xfs_params.fstrm_timer.val
 #define xfs_eofb_secs		xfs_params.eofb_timer.val
+#define xfs_cowb_secs		xfs_params.cowb_timer.val
 
 #define current_cpu()		(raw_smp_processor_id())
 #define current_pid()		(current->pid)
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 846483d56949..9b3d7c76915d 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -45,6 +45,8 @@
 #include "xfs_dir2.h"
 #include "xfs_rmap_item.h"
 #include "xfs_buf_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
 
 #define BLK_AVG(blk1, blk2)	((blk1+blk2) >> 1)
 
@@ -1924,6 +1926,10 @@ xlog_recover_reorder_trans(
 		case XFS_LI_EFI:
 		case XFS_LI_RUI:
 		case XFS_LI_RUD:
+		case XFS_LI_CUI:
+		case XFS_LI_CUD:
+		case XFS_LI_BUI:
+		case XFS_LI_BUD:
 			trace_xfs_log_recover_item_reorder_tail(log,
 							trans, item, pass);
 			list_move_tail(&item->ri_list, &inode_list);
@@ -2242,6 +2248,7 @@ xlog_recover_get_buf_lsn(
 	case XFS_ABTB_MAGIC:
 	case XFS_ABTC_MAGIC:
 	case XFS_RMAP_CRC_MAGIC:
+	case XFS_REFC_CRC_MAGIC:
 	case XFS_IBT_CRC_MAGIC:
 	case XFS_IBT_MAGIC: {
 		struct xfs_btree_block *btb = blk;
@@ -2415,6 +2422,9 @@ xlog_recover_validate_buf_type(
 		case XFS_RMAP_CRC_MAGIC:
 			bp->b_ops = &xfs_rmapbt_buf_ops;
 			break;
+		case XFS_REFC_CRC_MAGIC:
+			bp->b_ops = &xfs_refcountbt_buf_ops;
+			break;
 		default:
 			warnmsg = "Bad btree block magic!";
 			break;
@@ -3547,6 +3557,242 @@ xlog_recover_rud_pass2(
 }
 
 /*
+ * Copy an CUI format buffer from the given buf, and into the destination
+ * CUI format structure.  The CUI/CUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_cui_copy_format(
+	struct xfs_log_iovec		*buf,
+	struct xfs_cui_log_format	*dst_cui_fmt)
+{
+	struct xfs_cui_log_format	*src_cui_fmt;
+	uint				len;
+
+	src_cui_fmt = buf->i_addr;
+	len = xfs_cui_log_format_sizeof(src_cui_fmt->cui_nextents);
+
+	if (buf->i_len == len) {
+		memcpy(dst_cui_fmt, src_cui_fmt, len);
+		return 0;
+	}
+	return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent refcount update
+ * item from the cui format structure which was logged on disk.
+ * It allocates an in-core cui, copies the extents from the format
+ * structure into it, and adds the cui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_cui_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	int				error;
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_cui_log_item		*cuip;
+	struct xfs_cui_log_format	*cui_formatp;
+
+	cui_formatp = item->ri_buf[0].i_addr;
+
+	cuip = xfs_cui_init(mp, cui_formatp->cui_nextents);
+	error = xfs_cui_copy_format(&item->ri_buf[0], &cuip->cui_format);
+	if (error) {
+		xfs_cui_item_free(cuip);
+		return error;
+	}
+	atomic_set(&cuip->cui_next_extent, cui_formatp->cui_nextents);
+
+	spin_lock(&log->l_ailp->xa_lock);
+	/*
+	 * The CUI has two references. One for the CUD and one for CUI to ensure
+	 * it makes it into the AIL. Insert the CUI into the AIL directly and
+	 * drop the CUI reference. Note that xfs_trans_ail_update() drops the
+	 * AIL lock.
+	 */
+	xfs_trans_ail_update(log->l_ailp, &cuip->cui_item, lsn);
+	xfs_cui_release(cuip);
+	return 0;
+}
+
+
+/*
+ * This routine is called when an CUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding CUI if it
+ * was still in the log. To do this it searches the AIL for the CUI with an id
+ * equal to that in the CUD format structure. If we find it we drop the CUD
+ * reference, which removes the CUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_cud_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	struct xfs_cud_log_format	*cud_formatp;
+	struct xfs_cui_log_item		*cuip = NULL;
+	struct xfs_log_item		*lip;
+	__uint64_t			cui_id;
+	struct xfs_ail_cursor		cur;
+	struct xfs_ail			*ailp = log->l_ailp;
+
+	cud_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_cud_log_format))
+		return -EFSCORRUPTED;
+	cui_id = cud_formatp->cud_cui_id;
+
+	/*
+	 * Search for the CUI with the id in the CUD format structure in the
+	 * AIL.
+	 */
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_CUI) {
+			cuip = (struct xfs_cui_log_item *)lip;
+			if (cuip->cui_format.cui_id == cui_id) {
+				/*
+				 * Drop the CUD reference to the CUI. This
+				 * removes the CUI from the AIL and frees it.
+				 */
+				spin_unlock(&ailp->xa_lock);
+				xfs_cui_release(cuip);
+				spin_lock(&ailp->xa_lock);
+				break;
+			}
+		}
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+	}
+
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+
+	return 0;
+}
+
+/*
+ * Copy an BUI format buffer from the given buf, and into the destination
+ * BUI format structure.  The BUI/BUD items were designed not to need any
+ * special alignment handling.
+ */
+static int
+xfs_bui_copy_format(
+	struct xfs_log_iovec		*buf,
+	struct xfs_bui_log_format	*dst_bui_fmt)
+{
+	struct xfs_bui_log_format	*src_bui_fmt;
+	uint				len;
+
+	src_bui_fmt = buf->i_addr;
+	len = xfs_bui_log_format_sizeof(src_bui_fmt->bui_nextents);
+
+	if (buf->i_len == len) {
+		memcpy(dst_bui_fmt, src_bui_fmt, len);
+		return 0;
+	}
+	return -EFSCORRUPTED;
+}
+
+/*
+ * This routine is called to create an in-core extent bmap update
+ * item from the bui format structure which was logged on disk.
+ * It allocates an in-core bui, copies the extents from the format
+ * structure into it, and adds the bui to the AIL with the given
+ * LSN.
+ */
+STATIC int
+xlog_recover_bui_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item,
+	xfs_lsn_t			lsn)
+{
+	int				error;
+	struct xfs_mount		*mp = log->l_mp;
+	struct xfs_bui_log_item		*buip;
+	struct xfs_bui_log_format	*bui_formatp;
+
+	bui_formatp = item->ri_buf[0].i_addr;
+
+	if (bui_formatp->bui_nextents != XFS_BUI_MAX_FAST_EXTENTS)
+		return -EFSCORRUPTED;
+	buip = xfs_bui_init(mp);
+	error = xfs_bui_copy_format(&item->ri_buf[0], &buip->bui_format);
+	if (error) {
+		xfs_bui_item_free(buip);
+		return error;
+	}
+	atomic_set(&buip->bui_next_extent, bui_formatp->bui_nextents);
+
+	spin_lock(&log->l_ailp->xa_lock);
+	/*
+	 * The RUI has two references. One for the RUD and one for RUI to ensure
+	 * it makes it into the AIL. Insert the RUI into the AIL directly and
+	 * drop the RUI reference. Note that xfs_trans_ail_update() drops the
+	 * AIL lock.
+	 */
+	xfs_trans_ail_update(log->l_ailp, &buip->bui_item, lsn);
+	xfs_bui_release(buip);
+	return 0;
+}
+
+
+/*
+ * This routine is called when an BUD format structure is found in a committed
+ * transaction in the log. Its purpose is to cancel the corresponding BUI if it
+ * was still in the log. To do this it searches the AIL for the BUI with an id
+ * equal to that in the BUD format structure. If we find it we drop the BUD
+ * reference, which removes the BUI from the AIL and frees it.
+ */
+STATIC int
+xlog_recover_bud_pass2(
+	struct xlog			*log,
+	struct xlog_recover_item	*item)
+{
+	struct xfs_bud_log_format	*bud_formatp;
+	struct xfs_bui_log_item		*buip = NULL;
+	struct xfs_log_item		*lip;
+	__uint64_t			bui_id;
+	struct xfs_ail_cursor		cur;
+	struct xfs_ail			*ailp = log->l_ailp;
+
+	bud_formatp = item->ri_buf[0].i_addr;
+	if (item->ri_buf[0].i_len != sizeof(struct xfs_bud_log_format))
+		return -EFSCORRUPTED;
+	bui_id = bud_formatp->bud_bui_id;
+
+	/*
+	 * Search for the BUI with the id in the BUD format structure in the
+	 * AIL.
+	 */
+	spin_lock(&ailp->xa_lock);
+	lip = xfs_trans_ail_cursor_first(ailp, &cur, 0);
+	while (lip != NULL) {
+		if (lip->li_type == XFS_LI_BUI) {
+			buip = (struct xfs_bui_log_item *)lip;
+			if (buip->bui_format.bui_id == bui_id) {
+				/*
+				 * Drop the BUD reference to the BUI. This
+				 * removes the BUI from the AIL and frees it.
+				 */
+				spin_unlock(&ailp->xa_lock);
+				xfs_bui_release(buip);
+				spin_lock(&ailp->xa_lock);
+				break;
+			}
+		}
+		lip = xfs_trans_ail_cursor_next(ailp, &cur);
+	}
+
+	xfs_trans_ail_cursor_done(&cur);
+	spin_unlock(&ailp->xa_lock);
+
+	return 0;
+}
+
+/*
  * This routine is called when an inode create format structure is found in a
  * committed transaction in the log.  It's purpose is to initialise the inodes
  * being allocated on disk. This requires us to get inode cluster buffers that
@@ -3773,6 +4019,10 @@ xlog_recover_ra_pass2(
 	case XFS_LI_QUOTAOFF:
 	case XFS_LI_RUI:
 	case XFS_LI_RUD:
+	case XFS_LI_CUI:
+	case XFS_LI_CUD:
+	case XFS_LI_BUI:
+	case XFS_LI_BUD:
 	default:
 		break;
 	}
@@ -3798,6 +4048,10 @@ xlog_recover_commit_pass1(
 	case XFS_LI_ICREATE:
 	case XFS_LI_RUI:
 	case XFS_LI_RUD:
+	case XFS_LI_CUI:
+	case XFS_LI_CUD:
+	case XFS_LI_BUI:
+	case XFS_LI_BUD:
 		/* nothing to do in pass 1 */
 		return 0;
 	default:
@@ -3832,6 +4086,14 @@ xlog_recover_commit_pass2(
 		return xlog_recover_rui_pass2(log, item, trans->r_lsn);
 	case XFS_LI_RUD:
 		return xlog_recover_rud_pass2(log, item);
+	case XFS_LI_CUI:
+		return xlog_recover_cui_pass2(log, item, trans->r_lsn);
+	case XFS_LI_CUD:
+		return xlog_recover_cud_pass2(log, item);
+	case XFS_LI_BUI:
+		return xlog_recover_bui_pass2(log, item, trans->r_lsn);
+	case XFS_LI_BUD:
+		return xlog_recover_bud_pass2(log, item);
 	case XFS_LI_DQUOT:
 		return xlog_recover_dquot_pass2(log, buffer_list, item,
 						trans->r_lsn);
@@ -4419,12 +4681,94 @@ xlog_recover_cancel_rui(
 	spin_lock(&ailp->xa_lock);
 }
 
+/* Recover the CUI if necessary. */
+STATIC int
+xlog_recover_process_cui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_cui_log_item		*cuip;
+	int				error;
+
+	/*
+	 * Skip CUIs that we've already processed.
+	 */
+	cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+	if (test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags))
+		return 0;
+
+	spin_unlock(&ailp->xa_lock);
+	error = xfs_cui_recover(mp, cuip);
+	spin_lock(&ailp->xa_lock);
+
+	return error;
+}
+
+/* Release the CUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_cui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_cui_log_item		*cuip;
+
+	cuip = container_of(lip, struct xfs_cui_log_item, cui_item);
+
+	spin_unlock(&ailp->xa_lock);
+	xfs_cui_release(cuip);
+	spin_lock(&ailp->xa_lock);
+}
+
+/* Recover the BUI if necessary. */
+STATIC int
+xlog_recover_process_bui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_bui_log_item		*buip;
+	int				error;
+
+	/*
+	 * Skip BUIs that we've already processed.
+	 */
+	buip = container_of(lip, struct xfs_bui_log_item, bui_item);
+	if (test_bit(XFS_BUI_RECOVERED, &buip->bui_flags))
+		return 0;
+
+	spin_unlock(&ailp->xa_lock);
+	error = xfs_bui_recover(mp, buip);
+	spin_lock(&ailp->xa_lock);
+
+	return error;
+}
+
+/* Release the BUI since we're cancelling everything. */
+STATIC void
+xlog_recover_cancel_bui(
+	struct xfs_mount		*mp,
+	struct xfs_ail			*ailp,
+	struct xfs_log_item		*lip)
+{
+	struct xfs_bui_log_item		*buip;
+
+	buip = container_of(lip, struct xfs_bui_log_item, bui_item);
+
+	spin_unlock(&ailp->xa_lock);
+	xfs_bui_release(buip);
+	spin_lock(&ailp->xa_lock);
+}
+
 /* Is this log item a deferred action intent? */
 static inline bool xlog_item_is_intent(struct xfs_log_item *lip)
 {
 	switch (lip->li_type) {
 	case XFS_LI_EFI:
 	case XFS_LI_RUI:
+	case XFS_LI_CUI:
+	case XFS_LI_BUI:
 		return true;
 	default:
 		return false;
@@ -4488,6 +4832,12 @@ xlog_recover_process_intents(
 		case XFS_LI_RUI:
 			error = xlog_recover_process_rui(log->l_mp, ailp, lip);
 			break;
+		case XFS_LI_CUI:
+			error = xlog_recover_process_cui(log->l_mp, ailp, lip);
+			break;
+		case XFS_LI_BUI:
+			error = xlog_recover_process_bui(log->l_mp, ailp, lip);
+			break;
 		}
 		if (error)
 			goto out;
@@ -4535,6 +4885,12 @@ xlog_recover_cancel_intents(
 		case XFS_LI_RUI:
 			xlog_recover_cancel_rui(log->l_mp, ailp, lip);
 			break;
+		case XFS_LI_CUI:
+			xlog_recover_cancel_cui(log->l_mp, ailp, lip);
+			break;
+		case XFS_LI_BUI:
+			xlog_recover_cancel_bui(log->l_mp, ailp, lip);
+			break;
 		}
 
 		lip = xfs_trans_ail_cursor_next(ailp, &cur);
@@ -4613,6 +4969,7 @@ xlog_recover_process_one_iunlink(
 	if (error)
 		goto fail_iput;
 
+	xfs_iflags_clear(ip, XFS_IRECOVERY);
 	ASSERT(VFS_I(ip)->i_nlink == 0);
 	ASSERT(VFS_I(ip)->i_mode != 0);
 
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 56e85a6c85c7..fc7873942bea 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -43,6 +43,8 @@
 #include "xfs_icache.h"
 #include "xfs_sysfs.h"
 #include "xfs_rmap_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_reflink.h"
 
 
 static DEFINE_MUTEX(xfs_uuid_table_mutex);
@@ -684,6 +686,7 @@ xfs_mountfs(
 	xfs_bmap_compute_maxlevels(mp, XFS_ATTR_FORK);
 	xfs_ialloc_compute_maxlevels(mp);
 	xfs_rmapbt_compute_maxlevels(mp);
+	xfs_refcountbt_compute_maxlevels(mp);
 
 	xfs_set_maxicount(mp);
 
@@ -923,6 +926,15 @@ xfs_mountfs(
 	}
 
 	/*
+	 * During the second phase of log recovery, we need iget and
+	 * iput to behave like they do for an active filesystem.
+	 * xfs_fs_drop_inode needs to be able to prevent the deletion
+	 * of inodes before we're done replaying log items on those
+	 * inodes.
+	 */
+	mp->m_super->s_flags |= MS_ACTIVE;
+
+	/*
 	 * Finish recovering the file system.  This part needed to be delayed
 	 * until after the root and real-time bitmap inodes were consistently
 	 * read in.
@@ -974,10 +986,28 @@ xfs_mountfs(
 		if (error)
 			xfs_warn(mp,
 	"Unable to allocate reserve blocks. Continuing without reserve pool.");
+
+		/* Recover any CoW blocks that never got remapped. */
+		error = xfs_reflink_recover_cow(mp);
+		if (error) {
+			xfs_err(mp,
+	"Error %d recovering leftover CoW allocations.", error);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			goto out_quota;
+		}
+
+		/* Reserve AG blocks for future btree expansion. */
+		error = xfs_fs_reserve_ag_blocks(mp);
+		if (error && error != -ENOSPC)
+			goto out_agresv;
 	}
 
 	return 0;
 
+ out_agresv:
+	xfs_fs_unreserve_ag_blocks(mp);
+ out_quota:
+	xfs_qm_unmount_quotas(mp);
  out_rtunmount:
 	xfs_rtunmount_inodes(mp);
  out_rele_rip:
@@ -1019,7 +1049,9 @@ xfs_unmountfs(
 	int			error;
 
 	cancel_delayed_work_sync(&mp->m_eofblocks_work);
+	cancel_delayed_work_sync(&mp->m_cowblocks_work);
 
+	xfs_fs_unreserve_ag_blocks(mp);
 	xfs_qm_unmount_quotas(mp);
 	xfs_rtunmount_inodes(mp);
 	IRELE(mp->m_rootip);
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index 041d9493e798..819b80b15bfb 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -124,10 +124,13 @@ typedef struct xfs_mount {
 	uint			m_inobt_mnr[2];	/* min inobt btree records */
 	uint			m_rmap_mxr[2];	/* max rmap btree records */
 	uint			m_rmap_mnr[2];	/* min rmap btree records */
+	uint			m_refc_mxr[2];	/* max refc btree records */
+	uint			m_refc_mnr[2];	/* min refc btree records */
 	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
 	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
 	uint			m_in_maxlevels;	/* max inobt btree levels. */
 	uint			m_rmap_maxlevels; /* max rmap btree levels */
+	uint			m_refc_maxlevels; /* max refcount btree level */
 	xfs_extlen_t		m_ag_prealloc_blocks; /* reserved ag blocks */
 	uint			m_alloc_set_aside; /* space we can't use */
 	uint			m_ag_max_usable; /* max space per AG */
@@ -161,6 +164,8 @@ typedef struct xfs_mount {
 	struct delayed_work	m_reclaim_work;	/* background inode reclaim */
 	struct delayed_work	m_eofblocks_work; /* background eof blocks
 						     trimming */
+	struct delayed_work	m_cowblocks_work; /* background cow blocks
+						     trimming */
 	bool			m_update_sb;	/* sb needs update in mount */
 	int64_t			m_low_space[XFS_LOWSP_MAX];
 						/* low free space thresholds */
@@ -399,6 +404,9 @@ typedef struct xfs_perag {
 	struct xfs_ag_resv	pag_meta_resv;
 	/* Blocks reserved for just AGFL-based metadata. */
 	struct xfs_ag_resv	pag_agfl_resv;
+
+	/* reference count */
+	__uint8_t		pagf_refcount_level;
 } xfs_perag_t;
 
 static inline struct xfs_ag_resv *
diff --git a/fs/xfs/xfs_ondisk.h b/fs/xfs/xfs_ondisk.h
index 69e2986a3776..0c381d71b242 100644
--- a/fs/xfs/xfs_ondisk.h
+++ b/fs/xfs/xfs_ondisk.h
@@ -49,6 +49,8 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(struct xfs_dsymlink_hdr,		56);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_key,		4);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_inobt_rec,		16);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_key,		4);
+	XFS_CHECK_STRUCT_SIZE(struct xfs_refcount_rec,		12);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_key,		20);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_rmap_rec,		24);
 	XFS_CHECK_STRUCT_SIZE(struct xfs_timestamp,		8);
@@ -56,6 +58,7 @@ xfs_check_ondisk_structs(void)
 	XFS_CHECK_STRUCT_SIZE(xfs_alloc_ptr_t,			4);
 	XFS_CHECK_STRUCT_SIZE(xfs_alloc_rec_t,			8);
 	XFS_CHECK_STRUCT_SIZE(xfs_inobt_ptr_t,			4);
+	XFS_CHECK_STRUCT_SIZE(xfs_refcount_ptr_t,		4);
 	XFS_CHECK_STRUCT_SIZE(xfs_rmap_ptr_t,			4);
 
 	/* dir/attr trees */
diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c
index 0f14b2e4bf6c..93a7aafa56d6 100644
--- a/fs/xfs/xfs_pnfs.c
+++ b/fs/xfs/xfs_pnfs.c
@@ -114,6 +114,13 @@ xfs_fs_map_blocks(
 		return -ENXIO;
 
 	/*
+	 * The pNFS block layout spec actually supports reflink like
+	 * functionality, but the Linux pNFS server doesn't implement it yet.
+	 */
+	if (xfs_is_reflink_inode(ip))
+		return -ENXIO;
+
+	/*
 	 * Lock out any other I/O before we flush and invalidate the pagecache,
 	 * and then hand out a layout to the remote system.  This is very
 	 * similar to direct I/O, except that the synchronization is much more
diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c
new file mode 100644
index 000000000000..fe86a668a57e
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.c
@@ -0,0 +1,539 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_bit.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_buf_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_log.h"
+#include "xfs_refcount.h"
+
+
+kmem_zone_t	*xfs_cui_zone;
+kmem_zone_t	*xfs_cud_zone;
+
+static inline struct xfs_cui_log_item *CUI_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_cui_log_item, cui_item);
+}
+
+void
+xfs_cui_item_free(
+	struct xfs_cui_log_item	*cuip)
+{
+	if (cuip->cui_format.cui_nextents > XFS_CUI_MAX_FAST_EXTENTS)
+		kmem_free(cuip);
+	else
+		kmem_zone_free(xfs_cui_zone, cuip);
+}
+
+STATIC void
+xfs_cui_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip);
+
+	*nvecs += 1;
+	*nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cui log item. We use only 1 iovec, and we point that
+ * at the cui_log_format structure embedded in the cui item.
+ * It is at this point that we assert that all of the extent
+ * slots in the cui item have been filled.
+ */
+STATIC void
+xfs_cui_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	ASSERT(atomic_read(&cuip->cui_next_extent) ==
+			cuip->cui_format.cui_nextents);
+
+	cuip->cui_format.cui_type = XFS_LI_CUI;
+	cuip->cui_format.cui_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUI_FORMAT, &cuip->cui_format,
+			xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents));
+}
+
+/*
+ * Pinning has no meaning for an cui item, so just return.
+ */
+STATIC void
+xfs_cui_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * The unpin operation is the last place an CUI is manipulated in the log. It is
+ * either inserted in the AIL or aborted in the event of a log I/O error. In
+ * either case, the CUI transaction has been successfully committed to make it
+ * this far. Therefore, we expect whoever committed the CUI to either construct
+ * and commit the CUD or drop the CUD's reference in the event of error. Simply
+ * drop the log's CUI reference now that the log is done with it.
+ */
+STATIC void
+xfs_cui_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+	struct xfs_cui_log_item	*cuip = CUI_ITEM(lip);
+
+	xfs_cui_release(cuip);
+}
+
+/*
+ * CUI items have no locking or pushing.  However, since CUIs are pulled from
+ * the AIL when their corresponding CUDs are committed to disk, their situation
+ * is very similar to being pinned.  Return XFS_ITEM_PINNED so that the caller
+ * will eventually flush the log.  This should help in getting the CUI out of
+ * the AIL.
+ */
+STATIC uint
+xfs_cui_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The CUI has been either committed or aborted if the transaction has been
+ * cancelled. If the transaction was cancelled, an CUD isn't going to be
+ * constructed and thus we free the CUI here directly.
+ */
+STATIC void
+xfs_cui_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	if (lip->li_flags & XFS_LI_ABORTED)
+		xfs_cui_item_free(CUI_ITEM(lip));
+}
+
+/*
+ * The CUI is logged only once and cannot be moved in the log, so simply return
+ * the lsn at which it's been logged.
+ */
+STATIC xfs_lsn_t
+xfs_cui_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	return lsn;
+}
+
+/*
+ * The CUI dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_cui_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all cui log items.
+ */
+static const struct xfs_item_ops xfs_cui_item_ops = {
+	.iop_size	= xfs_cui_item_size,
+	.iop_format	= xfs_cui_item_format,
+	.iop_pin	= xfs_cui_item_pin,
+	.iop_unpin	= xfs_cui_item_unpin,
+	.iop_unlock	= xfs_cui_item_unlock,
+	.iop_committed	= xfs_cui_item_committed,
+	.iop_push	= xfs_cui_item_push,
+	.iop_committing = xfs_cui_item_committing,
+};
+
+/*
+ * Allocate and initialize an cui item with the given number of extents.
+ */
+struct xfs_cui_log_item *
+xfs_cui_init(
+	struct xfs_mount		*mp,
+	uint				nextents)
+
+{
+	struct xfs_cui_log_item		*cuip;
+
+	ASSERT(nextents > 0);
+	if (nextents > XFS_CUI_MAX_FAST_EXTENTS)
+		cuip = kmem_zalloc(xfs_cui_log_item_sizeof(nextents),
+				KM_SLEEP);
+	else
+		cuip = kmem_zone_zalloc(xfs_cui_zone, KM_SLEEP);
+
+	xfs_log_item_init(mp, &cuip->cui_item, XFS_LI_CUI, &xfs_cui_item_ops);
+	cuip->cui_format.cui_nextents = nextents;
+	cuip->cui_format.cui_id = (uintptr_t)(void *)cuip;
+	atomic_set(&cuip->cui_next_extent, 0);
+	atomic_set(&cuip->cui_refcount, 2);
+
+	return cuip;
+}
+
+/*
+ * Freeing the CUI requires that we remove it from the AIL if it has already
+ * been placed there. However, the CUI may not yet have been placed in the AIL
+ * when called by xfs_cui_release() from CUD processing due to the ordering of
+ * committed vs unpin operations in bulk insert operations. Hence the reference
+ * count to ensure only the last caller frees the CUI.
+ */
+void
+xfs_cui_release(
+	struct xfs_cui_log_item	*cuip)
+{
+	if (atomic_dec_and_test(&cuip->cui_refcount)) {
+		xfs_trans_ail_remove(&cuip->cui_item, SHUTDOWN_LOG_IO_ERROR);
+		xfs_cui_item_free(cuip);
+	}
+}
+
+static inline struct xfs_cud_log_item *CUD_ITEM(struct xfs_log_item *lip)
+{
+	return container_of(lip, struct xfs_cud_log_item, cud_item);
+}
+
+STATIC void
+xfs_cud_item_size(
+	struct xfs_log_item	*lip,
+	int			*nvecs,
+	int			*nbytes)
+{
+	*nvecs += 1;
+	*nbytes += sizeof(struct xfs_cud_log_format);
+}
+
+/*
+ * This is called to fill in the vector of log iovecs for the
+ * given cud log item. We use only 1 iovec, and we point that
+ * at the cud_log_format structure embedded in the cud item.
+ * It is at this point that we assert that all of the extent
+ * slots in the cud item have been filled.
+ */
+STATIC void
+xfs_cud_item_format(
+	struct xfs_log_item	*lip,
+	struct xfs_log_vec	*lv)
+{
+	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
+	struct xfs_log_iovec	*vecp = NULL;
+
+	cudp->cud_format.cud_type = XFS_LI_CUD;
+	cudp->cud_format.cud_size = 1;
+
+	xlog_copy_iovec(lv, &vecp, XLOG_REG_TYPE_CUD_FORMAT, &cudp->cud_format,
+			sizeof(struct xfs_cud_log_format));
+}
+
+/*
+ * Pinning has no meaning for an cud item, so just return.
+ */
+STATIC void
+xfs_cud_item_pin(
+	struct xfs_log_item	*lip)
+{
+}
+
+/*
+ * Since pinning has no meaning for an cud item, unpinning does
+ * not either.
+ */
+STATIC void
+xfs_cud_item_unpin(
+	struct xfs_log_item	*lip,
+	int			remove)
+{
+}
+
+/*
+ * There isn't much you can do to push on an cud item.  It is simply stuck
+ * waiting for the log to be flushed to disk.
+ */
+STATIC uint
+xfs_cud_item_push(
+	struct xfs_log_item	*lip,
+	struct list_head	*buffer_list)
+{
+	return XFS_ITEM_PINNED;
+}
+
+/*
+ * The CUD is either committed or aborted if the transaction is cancelled. If
+ * the transaction is cancelled, drop our reference to the CUI and free the
+ * CUD.
+ */
+STATIC void
+xfs_cud_item_unlock(
+	struct xfs_log_item	*lip)
+{
+	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
+
+	if (lip->li_flags & XFS_LI_ABORTED) {
+		xfs_cui_release(cudp->cud_cuip);
+		kmem_zone_free(xfs_cud_zone, cudp);
+	}
+}
+
+/*
+ * When the cud item is committed to disk, all we need to do is delete our
+ * reference to our partner cui item and then free ourselves. Since we're
+ * freeing ourselves we must return -1 to keep the transaction code from
+ * further referencing this item.
+ */
+STATIC xfs_lsn_t
+xfs_cud_item_committed(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+	struct xfs_cud_log_item	*cudp = CUD_ITEM(lip);
+
+	/*
+	 * Drop the CUI reference regardless of whether the CUD has been
+	 * aborted. Once the CUD transaction is constructed, it is the sole
+	 * responsibility of the CUD to release the CUI (even if the CUI is
+	 * aborted due to log I/O error).
+	 */
+	xfs_cui_release(cudp->cud_cuip);
+	kmem_zone_free(xfs_cud_zone, cudp);
+
+	return (xfs_lsn_t)-1;
+}
+
+/*
+ * The CUD dependency tracking op doesn't do squat.  It can't because
+ * it doesn't know where the free extent is coming from.  The dependency
+ * tracking has to be handled by the "enclosing" metadata object.  For
+ * example, for inodes, the inode is locked throughout the extent freeing
+ * so the dependency should be recorded there.
+ */
+STATIC void
+xfs_cud_item_committing(
+	struct xfs_log_item	*lip,
+	xfs_lsn_t		lsn)
+{
+}
+
+/*
+ * This is the ops vector shared by all cud log items.
+ */
+static const struct xfs_item_ops xfs_cud_item_ops = {
+	.iop_size	= xfs_cud_item_size,
+	.iop_format	= xfs_cud_item_format,
+	.iop_pin	= xfs_cud_item_pin,
+	.iop_unpin	= xfs_cud_item_unpin,
+	.iop_unlock	= xfs_cud_item_unlock,
+	.iop_committed	= xfs_cud_item_committed,
+	.iop_push	= xfs_cud_item_push,
+	.iop_committing = xfs_cud_item_committing,
+};
+
+/*
+ * Allocate and initialize an cud item with the given number of extents.
+ */
+struct xfs_cud_log_item *
+xfs_cud_init(
+	struct xfs_mount		*mp,
+	struct xfs_cui_log_item		*cuip)
+
+{
+	struct xfs_cud_log_item	*cudp;
+
+	cudp = kmem_zone_zalloc(xfs_cud_zone, KM_SLEEP);
+	xfs_log_item_init(mp, &cudp->cud_item, XFS_LI_CUD, &xfs_cud_item_ops);
+	cudp->cud_cuip = cuip;
+	cudp->cud_format.cud_cui_id = cuip->cui_format.cui_id;
+
+	return cudp;
+}
+
+/*
+ * Process a refcount update intent item that was recovered from the log.
+ * We need to update the refcountbt.
+ */
+int
+xfs_cui_recover(
+	struct xfs_mount		*mp,
+	struct xfs_cui_log_item		*cuip)
+{
+	int				i;
+	int				error = 0;
+	unsigned int			refc_type;
+	struct xfs_phys_extent		*refc;
+	xfs_fsblock_t			startblock_fsb;
+	bool				op_ok;
+	struct xfs_cud_log_item		*cudp;
+	struct xfs_trans		*tp;
+	struct xfs_btree_cur		*rcur = NULL;
+	enum xfs_refcount_intent_type	type;
+	xfs_fsblock_t			firstfsb;
+	xfs_fsblock_t			new_fsb;
+	xfs_extlen_t			new_len;
+	struct xfs_bmbt_irec		irec;
+	struct xfs_defer_ops		dfops;
+	bool				requeue_only = false;
+
+	ASSERT(!test_bit(XFS_CUI_RECOVERED, &cuip->cui_flags));
+
+	/*
+	 * First check the validity of the extents described by the
+	 * CUI.  If any are bad, then assume that all are bad and
+	 * just toss the CUI.
+	 */
+	for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+		refc = &cuip->cui_format.cui_extents[i];
+		startblock_fsb = XFS_BB_TO_FSB(mp,
+				   XFS_FSB_TO_DADDR(mp, refc->pe_startblock));
+		switch (refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK) {
+		case XFS_REFCOUNT_INCREASE:
+		case XFS_REFCOUNT_DECREASE:
+		case XFS_REFCOUNT_ALLOC_COW:
+		case XFS_REFCOUNT_FREE_COW:
+			op_ok = true;
+			break;
+		default:
+			op_ok = false;
+			break;
+		}
+		if (!op_ok || startblock_fsb == 0 ||
+		    refc->pe_len == 0 ||
+		    startblock_fsb >= mp->m_sb.sb_dblocks ||
+		    refc->pe_len >= mp->m_sb.sb_agblocks ||
+		    (refc->pe_flags & ~XFS_REFCOUNT_EXTENT_FLAGS)) {
+			/*
+			 * This will pull the CUI from the AIL and
+			 * free the memory associated with it.
+			 */
+			set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+			xfs_cui_release(cuip);
+			return -EIO;
+		}
+	}
+
+	/*
+	 * Under normal operation, refcount updates are deferred, so we
+	 * wouldn't be adding them directly to a transaction.  All
+	 * refcount updates manage reservation usage internally and
+	 * dynamically by deferring work that won't fit in the
+	 * transaction.  Normally, any work that needs to be deferred
+	 * gets attached to the same defer_ops that scheduled the
+	 * refcount update.  However, we're in log recovery here, so we
+	 * we create our own defer_ops and use that to finish up any
+	 * work that doesn't fit.
+	 */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_itruncate, 0, 0, 0, &tp);
+	if (error)
+		return error;
+	cudp = xfs_trans_get_cud(tp, cuip);
+
+	xfs_defer_init(&dfops, &firstfsb);
+	for (i = 0; i < cuip->cui_format.cui_nextents; i++) {
+		refc = &cuip->cui_format.cui_extents[i];
+		refc_type = refc->pe_flags & XFS_REFCOUNT_EXTENT_TYPE_MASK;
+		switch (refc_type) {
+		case XFS_REFCOUNT_INCREASE:
+		case XFS_REFCOUNT_DECREASE:
+		case XFS_REFCOUNT_ALLOC_COW:
+		case XFS_REFCOUNT_FREE_COW:
+			type = refc_type;
+			break;
+		default:
+			error = -EFSCORRUPTED;
+			goto abort_error;
+		}
+		if (requeue_only) {
+			new_fsb = refc->pe_startblock;
+			new_len = refc->pe_len;
+		} else
+			error = xfs_trans_log_finish_refcount_update(tp, cudp,
+				&dfops, type, refc->pe_startblock, refc->pe_len,
+				&new_fsb, &new_len, &rcur);
+		if (error)
+			goto abort_error;
+
+		/* Requeue what we didn't finish. */
+		if (new_len > 0) {
+			irec.br_startblock = new_fsb;
+			irec.br_blockcount = new_len;
+			switch (type) {
+			case XFS_REFCOUNT_INCREASE:
+				error = xfs_refcount_increase_extent(
+						tp->t_mountp, &dfops, &irec);
+				break;
+			case XFS_REFCOUNT_DECREASE:
+				error = xfs_refcount_decrease_extent(
+						tp->t_mountp, &dfops, &irec);
+				break;
+			case XFS_REFCOUNT_ALLOC_COW:
+				error = xfs_refcount_alloc_cow_extent(
+						tp->t_mountp, &dfops,
+						irec.br_startblock,
+						irec.br_blockcount);
+				break;
+			case XFS_REFCOUNT_FREE_COW:
+				error = xfs_refcount_free_cow_extent(
+						tp->t_mountp, &dfops,
+						irec.br_startblock,
+						irec.br_blockcount);
+				break;
+			default:
+				ASSERT(0);
+			}
+			if (error)
+				goto abort_error;
+			requeue_only = true;
+		}
+	}
+
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto abort_error;
+	set_bit(XFS_CUI_RECOVERED, &cuip->cui_flags);
+	error = xfs_trans_commit(tp);
+	return error;
+
+abort_error:
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+	xfs_defer_cancel(&dfops);
+	xfs_trans_cancel(tp);
+	return error;
+}
diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h
new file mode 100644
index 000000000000..5b74dddfa64b
--- /dev/null
+++ b/fs/xfs/xfs_refcount_item.h
@@ -0,0 +1,101 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef	__XFS_REFCOUNT_ITEM_H__
+#define	__XFS_REFCOUNT_ITEM_H__
+
+/*
+ * There are (currently) two pairs of refcount btree redo item types:
+ * increase and decrease.  The log items for these are CUI (refcount
+ * update intent) and CUD (refcount update done).  The redo item type
+ * is encoded in the flags field of each xfs_map_extent.
+ *
+ * *I items should be recorded in the *first* of a series of rolled
+ * transactions, and the *D items should be recorded in the same
+ * transaction that records the associated refcountbt updates.
+ *
+ * Should the system crash after the commit of the first transaction
+ * but before the commit of the final transaction in a series, log
+ * recovery will use the redo information recorded by the intent items
+ * to replay the refcountbt metadata updates.
+ */
+
+/* kernel only CUI/CUD definitions */
+
+struct xfs_mount;
+struct kmem_zone;
+
+/*
+ * Max number of extents in fast allocation path.
+ */
+#define	XFS_CUI_MAX_FAST_EXTENTS	16
+
+/*
+ * Define CUI flag bits. Manipulated by set/clear/test_bit operators.
+ */
+#define	XFS_CUI_RECOVERED		1
+
+/*
+ * This is the "refcount update intent" log item.  It is used to log
+ * the fact that some reverse mappings need to change.  It is used in
+ * conjunction with the "refcount update done" log item described
+ * below.
+ *
+ * These log items follow the same rules as struct xfs_efi_log_item;
+ * see the comments about that structure (in xfs_extfree_item.h) for
+ * more details.
+ */
+struct xfs_cui_log_item {
+	struct xfs_log_item		cui_item;
+	atomic_t			cui_refcount;
+	atomic_t			cui_next_extent;
+	unsigned long			cui_flags;	/* misc flags */
+	struct xfs_cui_log_format	cui_format;
+};
+
+static inline size_t
+xfs_cui_log_item_sizeof(
+	unsigned int		nr)
+{
+	return offsetof(struct xfs_cui_log_item, cui_format) +
+			xfs_cui_log_format_sizeof(nr);
+}
+
+/*
+ * This is the "refcount update done" log item.  It is used to log the
+ * fact that some refcountbt updates mentioned in an earlier cui item
+ * have been performed.
+ */
+struct xfs_cud_log_item {
+	struct xfs_log_item		cud_item;
+	struct xfs_cui_log_item		*cud_cuip;
+	struct xfs_cud_log_format	cud_format;
+};
+
+extern struct kmem_zone	*xfs_cui_zone;
+extern struct kmem_zone	*xfs_cud_zone;
+
+struct xfs_cui_log_item *xfs_cui_init(struct xfs_mount *, uint);
+struct xfs_cud_log_item *xfs_cud_init(struct xfs_mount *,
+		struct xfs_cui_log_item *);
+void xfs_cui_item_free(struct xfs_cui_log_item *);
+void xfs_cui_release(struct xfs_cui_log_item *);
+int xfs_cui_recover(struct xfs_mount *mp, struct xfs_cui_log_item *cuip);
+
+#endif	/* __XFS_REFCOUNT_ITEM_H__ */
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
new file mode 100644
index 000000000000..5965e9455d91
--- /dev/null
+++ b/fs/xfs/xfs_reflink.c
@@ -0,0 +1,1688 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_da_format.h"
+#include "xfs_da_btree.h"
+#include "xfs_inode.h"
+#include "xfs_trans.h"
+#include "xfs_inode_item.h"
+#include "xfs_bmap.h"
+#include "xfs_bmap_util.h"
+#include "xfs_error.h"
+#include "xfs_dir2.h"
+#include "xfs_dir2_priv.h"
+#include "xfs_ioctl.h"
+#include "xfs_trace.h"
+#include "xfs_log.h"
+#include "xfs_icache.h"
+#include "xfs_pnfs.h"
+#include "xfs_btree.h"
+#include "xfs_refcount_btree.h"
+#include "xfs_refcount.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_trans_space.h"
+#include "xfs_bit.h"
+#include "xfs_alloc.h"
+#include "xfs_quota_defs.h"
+#include "xfs_quota.h"
+#include "xfs_btree.h"
+#include "xfs_bmap_btree.h"
+#include "xfs_reflink.h"
+#include "xfs_iomap.h"
+#include "xfs_rmap_btree.h"
+#include "xfs_sb.h"
+#include "xfs_ag_resv.h"
+
+/*
+ * Copy on Write of Shared Blocks
+ *
+ * XFS must preserve "the usual" file semantics even when two files share
+ * the same physical blocks.  This means that a write to one file must not
+ * alter the blocks in a different file; the way that we'll do that is
+ * through the use of a copy-on-write mechanism.  At a high level, that
+ * means that when we want to write to a shared block, we allocate a new
+ * block, write the data to the new block, and if that succeeds we map the
+ * new block into the file.
+ *
+ * XFS provides a "delayed allocation" mechanism that defers the allocation
+ * of disk blocks to dirty-but-not-yet-mapped file blocks as long as
+ * possible.  This reduces fragmentation by enabling the filesystem to ask
+ * for bigger chunks less often, which is exactly what we want for CoW.
+ *
+ * The delalloc mechanism begins when the kernel wants to make a block
+ * writable (write_begin or page_mkwrite).  If the offset is not mapped, we
+ * create a delalloc mapping, which is a regular in-core extent, but without
+ * a real startblock.  (For delalloc mappings, the startblock encodes both
+ * a flag that this is a delalloc mapping, and a worst-case estimate of how
+ * many blocks might be required to put the mapping into the BMBT.)  delalloc
+ * mappings are a reservation against the free space in the filesystem;
+ * adjacent mappings can also be combined into fewer larger mappings.
+ *
+ * When dirty pages are being written out (typically in writepage), the
+ * delalloc reservations are converted into real mappings by allocating
+ * blocks and replacing the delalloc mapping with real ones.  A delalloc
+ * mapping can be replaced by several real ones if the free space is
+ * fragmented.
+ *
+ * We want to adapt the delalloc mechanism for copy-on-write, since the
+ * write paths are similar.  The first two steps (creating the reservation
+ * and allocating the blocks) are exactly the same as delalloc except that
+ * the mappings must be stored in a separate CoW fork because we do not want
+ * to disturb the mapping in the data fork until we're sure that the write
+ * succeeded.  IO completion in this case is the process of removing the old
+ * mapping from the data fork and moving the new mapping from the CoW fork to
+ * the data fork.  This will be discussed shortly.
+ *
+ * For now, unaligned directio writes will be bounced back to the page cache.
+ * Block-aligned directio writes will use the same mechanism as buffered
+ * writes.
+ *
+ * CoW remapping must be done after the data block write completes,
+ * because we don't want to destroy the old data fork map until we're sure
+ * the new block has been written.  Since the new mappings are kept in a
+ * separate fork, we can simply iterate these mappings to find the ones
+ * that cover the file blocks that we just CoW'd.  For each extent, simply
+ * unmap the corresponding range in the data fork, map the new range into
+ * the data fork, and remove the extent from the CoW fork.
+ *
+ * Since the remapping operation can be applied to an arbitrary file
+ * range, we record the need for the remap step as a flag in the ioend
+ * instead of declaring a new IO type.  This is required for direct io
+ * because we only have ioend for the whole dio, and we have to be able to
+ * remember the presence of unwritten blocks and CoW blocks with a single
+ * ioend structure.  Better yet, the more ground we can cover with one
+ * ioend, the better.
+ */
+
+/*
+ * Given an AG extent, find the lowest-numbered run of shared blocks
+ * within that range and return the range in fbno/flen.  If
+ * find_end_of_shared is true, return the longest contiguous extent of
+ * shared blocks.  If there are no shared extents, fbno and flen will
+ * be set to NULLAGBLOCK and 0, respectively.
+ */
+int
+xfs_reflink_find_shared(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno,
+	xfs_agblock_t		agbno,
+	xfs_extlen_t		aglen,
+	xfs_agblock_t		*fbno,
+	xfs_extlen_t		*flen,
+	bool			find_end_of_shared)
+{
+	struct xfs_buf		*agbp;
+	struct xfs_btree_cur	*cur;
+	int			error;
+
+	error = xfs_alloc_read_agf(mp, NULL, agno, 0, &agbp);
+	if (error)
+		return error;
+
+	cur = xfs_refcountbt_init_cursor(mp, NULL, agbp, agno, NULL);
+
+	error = xfs_refcount_find_shared(cur, agbno, aglen, fbno, flen,
+			find_end_of_shared);
+
+	xfs_btree_del_cursor(cur, error ? XFS_BTREE_ERROR : XFS_BTREE_NOERROR);
+
+	xfs_buf_relse(agbp);
+	return error;
+}
+
+/*
+ * Trim the mapping to the next block where there's a change in the
+ * shared/unshared status.  More specifically, this means that we
+ * find the lowest-numbered extent of shared blocks that coincides with
+ * the given block mapping.  If the shared extent overlaps the start of
+ * the mapping, trim the mapping to the end of the shared extent.  If
+ * the shared region intersects the mapping, trim the mapping to the
+ * start of the shared extent.  If there are no shared regions that
+ * overlap, just return the original extent.
+ */
+int
+xfs_reflink_trim_around_shared(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*irec,
+	bool			*shared,
+	bool			*trimmed)
+{
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		fbno;
+	xfs_extlen_t		flen;
+	int			error = 0;
+
+	/* Holes, unwritten, and delalloc extents cannot be shared */
+	if (!xfs_is_reflink_inode(ip) ||
+	    ISUNWRITTEN(irec) ||
+	    irec->br_startblock == HOLESTARTBLOCK ||
+	    irec->br_startblock == DELAYSTARTBLOCK) {
+		*shared = false;
+		return 0;
+	}
+
+	trace_xfs_reflink_trim_around_shared(ip, irec);
+
+	agno = XFS_FSB_TO_AGNO(ip->i_mount, irec->br_startblock);
+	agbno = XFS_FSB_TO_AGBNO(ip->i_mount, irec->br_startblock);
+	aglen = irec->br_blockcount;
+
+	error = xfs_reflink_find_shared(ip->i_mount, agno, agbno,
+			aglen, &fbno, &flen, true);
+	if (error)
+		return error;
+
+	*shared = *trimmed = false;
+	if (fbno == NULLAGBLOCK) {
+		/* No shared blocks at all. */
+		return 0;
+	} else if (fbno == agbno) {
+		/*
+		 * The start of this extent is shared.  Truncate the
+		 * mapping at the end of the shared region so that a
+		 * subsequent iteration starts at the start of the
+		 * unshared region.
+		 */
+		irec->br_blockcount = flen;
+		*shared = true;
+		if (flen != aglen)
+			*trimmed = true;
+		return 0;
+	} else {
+		/*
+		 * There's a shared extent midway through this extent.
+		 * Truncate the mapping at the start of the shared
+		 * extent so that a subsequent iteration starts at the
+		 * start of the shared region.
+		 */
+		irec->br_blockcount = fbno - agbno;
+		*trimmed = true;
+		return 0;
+	}
+}
+
+/* Create a CoW reservation for a range of blocks within a file. */
+static int
+__xfs_reflink_reserve_cow(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*offset_fsb,
+	xfs_fileoff_t		end_fsb,
+	bool			*skipped)
+{
+	struct xfs_bmbt_irec	got, prev, imap;
+	xfs_fileoff_t		orig_end_fsb;
+	int			nimaps, eof = 0, error = 0;
+	bool			shared = false, trimmed = false;
+	xfs_extnum_t		idx;
+	xfs_extlen_t		align;
+
+	/* Already reserved?  Skip the refcount btree access. */
+	xfs_bmap_search_extents(ip, *offset_fsb, XFS_COW_FORK, &eof, &idx,
+			&got, &prev);
+	if (!eof && got.br_startoff <= *offset_fsb) {
+		end_fsb = orig_end_fsb = got.br_startoff + got.br_blockcount;
+		trace_xfs_reflink_cow_found(ip, &got);
+		goto done;
+	}
+
+	/* Read extent from the source file. */
+	nimaps = 1;
+	error = xfs_bmapi_read(ip, *offset_fsb, end_fsb - *offset_fsb,
+			&imap, &nimaps, 0);
+	if (error)
+		goto out_unlock;
+	ASSERT(nimaps == 1);
+
+	/* Trim the mapping to the nearest shared extent boundary. */
+	error = xfs_reflink_trim_around_shared(ip, &imap, &shared, &trimmed);
+	if (error)
+		goto out_unlock;
+
+	end_fsb = orig_end_fsb = imap.br_startoff + imap.br_blockcount;
+
+	/* Not shared?  Just report the (potentially capped) extent. */
+	if (!shared) {
+		*skipped = true;
+		goto done;
+	}
+
+	/*
+	 * Fork all the shared blocks from our write offset until the end of
+	 * the extent.
+	 */
+	error = xfs_qm_dqattach_locked(ip, 0);
+	if (error)
+		goto out_unlock;
+
+	align = xfs_eof_alignment(ip, xfs_get_cowextsz_hint(ip));
+	if (align)
+		end_fsb = roundup_64(end_fsb, align);
+
+retry:
+	error = xfs_bmapi_reserve_delalloc(ip, XFS_COW_FORK, *offset_fsb,
+			end_fsb - *offset_fsb, &got,
+			&prev, &idx, eof);
+	switch (error) {
+	case 0:
+		break;
+	case -ENOSPC:
+	case -EDQUOT:
+		/* retry without any preallocation */
+		trace_xfs_reflink_cow_enospc(ip, &imap);
+		if (end_fsb != orig_end_fsb) {
+			end_fsb = orig_end_fsb;
+			goto retry;
+		}
+		/*FALLTHRU*/
+	default:
+		goto out_unlock;
+	}
+
+	if (end_fsb != orig_end_fsb)
+		xfs_inode_set_cowblocks_tag(ip);
+
+	trace_xfs_reflink_cow_alloc(ip, &got);
+done:
+	*offset_fsb = end_fsb;
+out_unlock:
+	return error;
+}
+
+/* Create a CoW reservation for part of a file. */
+int
+xfs_reflink_reserve_cow_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb, end_fsb;
+	bool			skipped = false;
+	int			error;
+
+	trace_xfs_reflink_reserve_cow_range(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	end_fsb = XFS_B_TO_FSB(mp, offset + count);
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	while (offset_fsb < end_fsb) {
+		error = __xfs_reflink_reserve_cow(ip, &offset_fsb, end_fsb,
+				&skipped);
+		if (error) {
+			trace_xfs_reflink_reserve_cow_range_error(ip, error,
+				_RET_IP_);
+			break;
+		}
+	}
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	return error;
+}
+
+/* Allocate all CoW reservations covering a range of blocks in a file. */
+static int
+__xfs_reflink_allocate_cow(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		*offset_fsb,
+	xfs_fileoff_t		end_fsb)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_bmbt_irec	imap;
+	struct xfs_defer_ops	dfops;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		first_block;
+	xfs_fileoff_t		next_fsb;
+	int			nimaps = 1, error;
+	bool			skipped = false;
+
+	xfs_defer_init(&dfops, &first_block);
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0,
+			XFS_TRANS_RESERVE, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+
+	next_fsb = *offset_fsb;
+	error = __xfs_reflink_reserve_cow(ip, &next_fsb, end_fsb, &skipped);
+	if (error)
+		goto out_trans_cancel;
+
+	if (skipped) {
+		*offset_fsb = next_fsb;
+		goto out_trans_cancel;
+	}
+
+	xfs_trans_ijoin(tp, ip, 0);
+	error = xfs_bmapi_write(tp, ip, *offset_fsb, next_fsb - *offset_fsb,
+			XFS_BMAPI_COWFORK, &first_block,
+			XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK),
+			&imap, &nimaps, &dfops);
+	if (error)
+		goto out_trans_cancel;
+
+	/* We might not have been able to map the whole delalloc extent */
+	*offset_fsb = min(*offset_fsb + imap.br_blockcount, next_fsb);
+
+	error = xfs_defer_finish(&tp, &dfops, NULL);
+	if (error)
+		goto out_trans_cancel;
+
+	error = xfs_trans_commit(tp);
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+out_trans_cancel:
+	xfs_defer_cancel(&dfops);
+	xfs_trans_cancel(tp);
+	goto out_unlock;
+}
+
+/* Allocate all CoW reservations covering a part of a file. */
+int
+xfs_reflink_allocate_cow_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		offset_fsb = XFS_B_TO_FSBT(mp, offset);
+	xfs_fileoff_t		end_fsb = XFS_B_TO_FSB(mp, offset + count);
+	int			error;
+
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	trace_xfs_reflink_allocate_cow_range(ip, offset, count);
+
+	/*
+	 * Make sure that the dquots are there.
+	 */
+	error = xfs_qm_dqattach(ip, 0);
+	if (error)
+		return error;
+
+	while (offset_fsb < end_fsb) {
+		error = __xfs_reflink_allocate_cow(ip, &offset_fsb, end_fsb);
+		if (error) {
+			trace_xfs_reflink_allocate_cow_range_error(ip, error,
+					_RET_IP_);
+			break;
+		}
+	}
+
+	return error;
+}
+
+/*
+ * Find the CoW reservation (and whether or not it needs block allocation)
+ * for a given byte offset of a file.
+ */
+bool
+xfs_reflink_find_cow_mapping(
+	struct xfs_inode		*ip,
+	xfs_off_t			offset,
+	struct xfs_bmbt_irec		*imap,
+	bool				*need_alloc)
+{
+	struct xfs_bmbt_irec		irec;
+	struct xfs_ifork		*ifp;
+	struct xfs_bmbt_rec_host	*gotp;
+	xfs_fileoff_t			bno;
+	xfs_extnum_t			idx;
+
+	ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL | XFS_ILOCK_SHARED));
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	/* Find the extent in the CoW fork. */
+	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	bno = XFS_B_TO_FSBT(ip->i_mount, offset);
+	gotp = xfs_iext_bno_to_ext(ifp, bno, &idx);
+	if (!gotp)
+		return false;
+
+	xfs_bmbt_get_all(gotp, &irec);
+	if (bno >= irec.br_startoff + irec.br_blockcount ||
+	    bno < irec.br_startoff)
+		return false;
+
+	trace_xfs_reflink_find_cow_mapping(ip, offset, 1, XFS_IO_OVERWRITE,
+			&irec);
+
+	/* If it's still delalloc, we must allocate later. */
+	*imap = irec;
+	*need_alloc = !!(isnullstartblock(irec.br_startblock));
+
+	return true;
+}
+
+/*
+ * Trim an extent to end at the next CoW reservation past offset_fsb.
+ */
+int
+xfs_reflink_trim_irec_to_next_cow(
+	struct xfs_inode		*ip,
+	xfs_fileoff_t			offset_fsb,
+	struct xfs_bmbt_irec		*imap)
+{
+	struct xfs_bmbt_irec		irec;
+	struct xfs_ifork		*ifp;
+	struct xfs_bmbt_rec_host	*gotp;
+	xfs_extnum_t			idx;
+
+	if (!xfs_is_reflink_inode(ip))
+		return 0;
+
+	/* Find the extent in the CoW fork. */
+	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	gotp = xfs_iext_bno_to_ext(ifp, offset_fsb, &idx);
+	if (!gotp)
+		return 0;
+	xfs_bmbt_get_all(gotp, &irec);
+
+	/* This is the extent before; try sliding up one. */
+	if (irec.br_startoff < offset_fsb) {
+		idx++;
+		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			return 0;
+		gotp = xfs_iext_get_ext(ifp, idx);
+		xfs_bmbt_get_all(gotp, &irec);
+	}
+
+	if (irec.br_startoff >= imap->br_startoff + imap->br_blockcount)
+		return 0;
+
+	imap->br_blockcount = irec.br_startoff - imap->br_startoff;
+	trace_xfs_reflink_trim_irec(ip, imap);
+
+	return 0;
+}
+
+/*
+ * Cancel all pending CoW reservations for some block range of an inode.
+ */
+int
+xfs_reflink_cancel_cow_blocks(
+	struct xfs_inode		*ip,
+	struct xfs_trans		**tpp,
+	xfs_fileoff_t			offset_fsb,
+	xfs_fileoff_t			end_fsb)
+{
+	struct xfs_bmbt_irec		irec;
+	xfs_filblks_t			count_fsb;
+	xfs_fsblock_t			firstfsb;
+	struct xfs_defer_ops		dfops;
+	int				error = 0;
+	int				nimaps;
+
+	if (!xfs_is_reflink_inode(ip))
+		return 0;
+
+	/* Go find the old extent in the CoW fork. */
+	while (offset_fsb < end_fsb) {
+		nimaps = 1;
+		count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+		error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec,
+				&nimaps, XFS_BMAPI_COWFORK);
+		if (error)
+			break;
+		ASSERT(nimaps == 1);
+
+		trace_xfs_reflink_cancel_cow(ip, &irec);
+
+		if (irec.br_startblock == DELAYSTARTBLOCK) {
+			/* Free a delayed allocation. */
+			xfs_mod_fdblocks(ip->i_mount, irec.br_blockcount,
+					false);
+			ip->i_delayed_blks -= irec.br_blockcount;
+
+			/* Remove the mapping from the CoW fork. */
+			error = xfs_bunmapi_cow(ip, &irec);
+			if (error)
+				break;
+		} else if (irec.br_startblock == HOLESTARTBLOCK) {
+			/* empty */
+		} else {
+			xfs_trans_ijoin(*tpp, ip, 0);
+			xfs_defer_init(&dfops, &firstfsb);
+
+			/* Free the CoW orphan record. */
+			error = xfs_refcount_free_cow_extent(ip->i_mount,
+					&dfops, irec.br_startblock,
+					irec.br_blockcount);
+			if (error)
+				break;
+
+			xfs_bmap_add_free(ip->i_mount, &dfops,
+					irec.br_startblock, irec.br_blockcount,
+					NULL);
+
+			/* Update quota accounting */
+			xfs_trans_mod_dquot_byino(*tpp, ip, XFS_TRANS_DQ_BCOUNT,
+					-(long)irec.br_blockcount);
+
+			/* Roll the transaction */
+			error = xfs_defer_finish(tpp, &dfops, ip);
+			if (error) {
+				xfs_defer_cancel(&dfops);
+				break;
+			}
+
+			/* Remove the mapping from the CoW fork. */
+			error = xfs_bunmapi_cow(ip, &irec);
+			if (error)
+				break;
+		}
+
+		/* Roll on... */
+		offset_fsb = irec.br_startoff + irec.br_blockcount;
+	}
+
+	return error;
+}
+
+/*
+ * Cancel all pending CoW reservations for some byte range of an inode.
+ */
+int
+xfs_reflink_cancel_cow_range(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		count)
+{
+	struct xfs_trans	*tp;
+	xfs_fileoff_t		offset_fsb;
+	xfs_fileoff_t		end_fsb;
+	int			error;
+
+	trace_xfs_reflink_cancel_cow_range(ip, offset, count);
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	if (count == NULLFILEOFF)
+		end_fsb = NULLFILEOFF;
+	else
+		end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+
+	/* Start a rolling transaction to remove the mappings */
+	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+			0, 0, 0, &tp);
+	if (error)
+		goto out;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/* Scrape out the old CoW reservations */
+	error = xfs_reflink_cancel_cow_blocks(ip, &tp, offset_fsb, end_fsb);
+	if (error)
+		goto out_cancel;
+
+	error = xfs_trans_commit(tp);
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+	trace_xfs_reflink_cancel_cow_range_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Remap parts of a file's data fork after a successful CoW.
+ */
+int
+xfs_reflink_end_cow(
+	struct xfs_inode		*ip,
+	xfs_off_t			offset,
+	xfs_off_t			count)
+{
+	struct xfs_bmbt_irec		irec;
+	struct xfs_bmbt_irec		uirec;
+	struct xfs_trans		*tp;
+	xfs_fileoff_t			offset_fsb;
+	xfs_fileoff_t			end_fsb;
+	xfs_filblks_t			count_fsb;
+	xfs_fsblock_t			firstfsb;
+	struct xfs_defer_ops		dfops;
+	int				error;
+	unsigned int			resblks;
+	xfs_filblks_t			ilen;
+	xfs_filblks_t			rlen;
+	int				nimaps;
+
+	trace_xfs_reflink_end_cow(ip, offset, count);
+
+	offset_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
+	end_fsb = XFS_B_TO_FSB(ip->i_mount, offset + count);
+	count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+
+	/* Start a rolling transaction to switch the mappings */
+	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+	error = xfs_trans_alloc(ip->i_mount, &M_RES(ip->i_mount)->tr_write,
+			resblks, 0, 0, &tp);
+	if (error)
+		goto out;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/* Go find the old extent in the CoW fork. */
+	while (offset_fsb < end_fsb) {
+		/* Read extent from the source file */
+		nimaps = 1;
+		count_fsb = (xfs_filblks_t)(end_fsb - offset_fsb);
+		error = xfs_bmapi_read(ip, offset_fsb, count_fsb, &irec,
+				&nimaps, XFS_BMAPI_COWFORK);
+		if (error)
+			goto out_cancel;
+		ASSERT(nimaps == 1);
+
+		ASSERT(irec.br_startblock != DELAYSTARTBLOCK);
+		trace_xfs_reflink_cow_remap(ip, &irec);
+
+		/*
+		 * We can have a hole in the CoW fork if part of a directio
+		 * write is CoW but part of it isn't.
+		 */
+		rlen = ilen = irec.br_blockcount;
+		if (irec.br_startblock == HOLESTARTBLOCK)
+			goto next_extent;
+
+		/* Unmap the old blocks in the data fork. */
+		while (rlen) {
+			xfs_defer_init(&dfops, &firstfsb);
+			error = __xfs_bunmapi(tp, ip, irec.br_startoff,
+					&rlen, 0, 1, &firstfsb, &dfops);
+			if (error)
+				goto out_defer;
+
+			/*
+			 * Trim the extent to whatever got unmapped.
+			 * Remember, bunmapi works backwards.
+			 */
+			uirec.br_startblock = irec.br_startblock + rlen;
+			uirec.br_startoff = irec.br_startoff + rlen;
+			uirec.br_blockcount = irec.br_blockcount - rlen;
+			irec.br_blockcount = rlen;
+			trace_xfs_reflink_cow_remap_piece(ip, &uirec);
+
+			/* Free the CoW orphan record. */
+			error = xfs_refcount_free_cow_extent(tp->t_mountp,
+					&dfops, uirec.br_startblock,
+					uirec.br_blockcount);
+			if (error)
+				goto out_defer;
+
+			/* Map the new blocks into the data fork. */
+			error = xfs_bmap_map_extent(tp->t_mountp, &dfops,
+					ip, &uirec);
+			if (error)
+				goto out_defer;
+
+			/* Remove the mapping from the CoW fork. */
+			error = xfs_bunmapi_cow(ip, &uirec);
+			if (error)
+				goto out_defer;
+
+			error = xfs_defer_finish(&tp, &dfops, ip);
+			if (error)
+				goto out_defer;
+		}
+
+next_extent:
+		/* Roll on... */
+		offset_fsb = irec.br_startoff + ilen;
+	}
+
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		goto out;
+	return 0;
+
+out_defer:
+	xfs_defer_cancel(&dfops);
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+	trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Free leftover CoW reservations that didn't get cleaned out.
+ */
+int
+xfs_reflink_recover_cow(
+	struct xfs_mount	*mp)
+{
+	xfs_agnumber_t		agno;
+	int			error = 0;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return 0;
+
+	for (agno = 0; agno < mp->m_sb.sb_agcount; agno++) {
+		error = xfs_refcount_recover_cow_leftovers(mp, agno);
+		if (error)
+			break;
+	}
+
+	return error;
+}
+
+/*
+ * Reflinking (Block) Ranges of Two Files Together
+ *
+ * First, ensure that the reflink flag is set on both inodes.  The flag is an
+ * optimization to avoid unnecessary refcount btree lookups in the write path.
+ *
+ * Now we can iteratively remap the range of extents (and holes) in src to the
+ * corresponding ranges in dest.  Let drange and srange denote the ranges of
+ * logical blocks in dest and src touched by the reflink operation.
+ *
+ * While the length of drange is greater than zero,
+ *    - Read src's bmbt at the start of srange ("imap")
+ *    - If imap doesn't exist, make imap appear to start at the end of srange
+ *      with zero length.
+ *    - If imap starts before srange, advance imap to start at srange.
+ *    - If imap goes beyond srange, truncate imap to end at the end of srange.
+ *    - Punch (imap start - srange start + imap len) blocks from dest at
+ *      offset (drange start).
+ *    - If imap points to a real range of pblks,
+ *         > Increase the refcount of the imap's pblks
+ *         > Map imap's pblks into dest at the offset
+ *           (drange start + imap start - srange start)
+ *    - Advance drange and srange by (imap start - srange start + imap len)
+ *
+ * Finally, if the reflink made dest longer, update both the in-core and
+ * on-disk file sizes.
+ *
+ * ASCII Art Demonstration:
+ *
+ * Let's say we want to reflink this source file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS (src file)
+ *   <-------------------->
+ *
+ * into this destination file:
+ *
+ * --DDDDDDDDDDDDDDDDDDD--DDD (dest file)
+ *        <-------------------->
+ * '-' means a hole, and 'S' and 'D' are written blocks in the src and dest.
+ * Observe that the range has different logical offsets in either file.
+ *
+ * Consider that the first extent in the source file doesn't line up with our
+ * reflink range.  Unmapping  and remapping are separate operations, so we can
+ * unmap more blocks from the destination file than we remap.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *   <------->
+ * --DDDDD---------DDDDD--DDD
+ *        <------->
+ *
+ * Now remap the source extent into the destination file:
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *   <------->
+ * --DDDDD--SSSSSSSDDDDD--DDD
+ *        <------->
+ *
+ * Do likewise with the second hole and extent in our range.  Holes in the
+ * unmap range don't affect our operation.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *            <---->
+ * --DDDDD--SSSSSSS-SSSSS-DDD
+ *                 <---->
+ *
+ * Finally, unmap and remap part of the third extent.  This will increase the
+ * size of the destination file.
+ *
+ * ----SSSSSSS-SSSSS----SSSSSS
+ *                  <----->
+ * --DDDDD--SSSSSSS-SSSSS----SSS
+ *                       <----->
+ *
+ * Once we update the destination file's i_size, we're done.
+ */
+
+/*
+ * Ensure the reflink bit is set in both inodes.
+ */
+STATIC int
+xfs_reflink_set_inode_flag(
+	struct xfs_inode	*src,
+	struct xfs_inode	*dest)
+{
+	struct xfs_mount	*mp = src->i_mount;
+	int			error;
+	struct xfs_trans	*tp;
+
+	if (xfs_is_reflink_inode(src) && xfs_is_reflink_inode(dest))
+		return 0;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
+		goto out_error;
+
+	/* Lock both files against IO */
+	if (src->i_ino == dest->i_ino)
+		xfs_ilock(src, XFS_ILOCK_EXCL);
+	else
+		xfs_lock_two_inodes(src, dest, XFS_ILOCK_EXCL);
+
+	if (!xfs_is_reflink_inode(src)) {
+		trace_xfs_reflink_set_inode_flag(src);
+		xfs_trans_ijoin(tp, src, XFS_ILOCK_EXCL);
+		src->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+		xfs_trans_log_inode(tp, src, XFS_ILOG_CORE);
+		xfs_ifork_init_cow(src);
+	} else
+		xfs_iunlock(src, XFS_ILOCK_EXCL);
+
+	if (src->i_ino == dest->i_ino)
+		goto commit_flags;
+
+	if (!xfs_is_reflink_inode(dest)) {
+		trace_xfs_reflink_set_inode_flag(dest);
+		xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+		dest->i_d.di_flags2 |= XFS_DIFLAG2_REFLINK;
+		xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+		xfs_ifork_init_cow(dest);
+	} else
+		xfs_iunlock(dest, XFS_ILOCK_EXCL);
+
+commit_flags:
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out_error;
+	return error;
+
+out_error:
+	trace_xfs_reflink_set_inode_flag_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Update destination inode size & cowextsize hint, if necessary.
+ */
+STATIC int
+xfs_reflink_update_dest(
+	struct xfs_inode	*dest,
+	xfs_off_t		newlen,
+	xfs_extlen_t		cowextsize)
+{
+	struct xfs_mount	*mp = dest->i_mount;
+	struct xfs_trans	*tp;
+	int			error;
+
+	if (newlen <= i_size_read(VFS_I(dest)) && cowextsize == 0)
+		return 0;
+
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_ichange, 0, 0, 0, &tp);
+	if (error)
+		goto out_error;
+
+	xfs_ilock(dest, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, dest, XFS_ILOCK_EXCL);
+
+	if (newlen > i_size_read(VFS_I(dest))) {
+		trace_xfs_reflink_update_inode_size(dest, newlen);
+		i_size_write(VFS_I(dest), newlen);
+		dest->i_d.di_size = newlen;
+	}
+
+	if (cowextsize) {
+		dest->i_d.di_cowextsize = cowextsize;
+		dest->i_d.di_flags2 |= XFS_DIFLAG2_COWEXTSIZE;
+	}
+
+	xfs_trans_log_inode(tp, dest, XFS_ILOG_CORE);
+
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out_error;
+	return error;
+
+out_error:
+	trace_xfs_reflink_update_inode_size_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Do we have enough reserve in this AG to handle a reflink?  The refcount
+ * btree already reserved all the space it needs, but the rmap btree can grow
+ * infinitely, so we won't allow more reflinks when the AG is down to the
+ * btree reserves.
+ */
+static int
+xfs_reflink_ag_has_free_space(
+	struct xfs_mount	*mp,
+	xfs_agnumber_t		agno)
+{
+	struct xfs_perag	*pag;
+	int			error = 0;
+
+	if (!xfs_sb_version_hasrmapbt(&mp->m_sb))
+		return 0;
+
+	pag = xfs_perag_get(mp, agno);
+	if (xfs_ag_resv_critical(pag, XFS_AG_RESV_AGFL) ||
+	    xfs_ag_resv_critical(pag, XFS_AG_RESV_METADATA))
+		error = -ENOSPC;
+	xfs_perag_put(pag);
+	return error;
+}
+
+/*
+ * Unmap a range of blocks from a file, then map other blocks into the hole.
+ * The range to unmap is (destoff : destoff + srcioff + irec->br_blockcount).
+ * The extent irec is mapped into dest at irec->br_startoff.
+ */
+STATIC int
+xfs_reflink_remap_extent(
+	struct xfs_inode	*ip,
+	struct xfs_bmbt_irec	*irec,
+	xfs_fileoff_t		destoff,
+	xfs_off_t		new_isize)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	xfs_fsblock_t		firstfsb;
+	unsigned int		resblks;
+	struct xfs_defer_ops	dfops;
+	struct xfs_bmbt_irec	uirec;
+	bool			real_extent;
+	xfs_filblks_t		rlen;
+	xfs_filblks_t		unmap_len;
+	xfs_off_t		newlen;
+	int			error;
+
+	unmap_len = irec->br_startoff + irec->br_blockcount - destoff;
+	trace_xfs_reflink_punch_range(ip, destoff, unmap_len);
+
+	/* Only remap normal extents. */
+	real_extent =  (irec->br_startblock != HOLESTARTBLOCK &&
+			irec->br_startblock != DELAYSTARTBLOCK &&
+			!ISUNWRITTEN(irec));
+
+	/* No reflinking if we're low on space */
+	if (real_extent) {
+		error = xfs_reflink_ag_has_free_space(mp,
+				XFS_FSB_TO_AGNO(mp, irec->br_startblock));
+		if (error)
+			goto out;
+	}
+
+	/* Start a rolling transaction to switch the mappings */
+	resblks = XFS_EXTENTADD_SPACE_RES(ip->i_mount, XFS_DATA_FORK);
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, 0, &tp);
+	if (error)
+		goto out;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	/* If we're not just clearing space, then do we have enough quota? */
+	if (real_extent) {
+		error = xfs_trans_reserve_quota_nblks(tp, ip,
+				irec->br_blockcount, 0, XFS_QMOPT_RES_REGBLKS);
+		if (error)
+			goto out_cancel;
+	}
+
+	trace_xfs_reflink_remap(ip, irec->br_startoff,
+				irec->br_blockcount, irec->br_startblock);
+
+	/* Unmap the old blocks in the data fork. */
+	rlen = unmap_len;
+	while (rlen) {
+		xfs_defer_init(&dfops, &firstfsb);
+		error = __xfs_bunmapi(tp, ip, destoff, &rlen, 0, 1,
+				&firstfsb, &dfops);
+		if (error)
+			goto out_defer;
+
+		/*
+		 * Trim the extent to whatever got unmapped.
+		 * Remember, bunmapi works backwards.
+		 */
+		uirec.br_startblock = irec->br_startblock + rlen;
+		uirec.br_startoff = irec->br_startoff + rlen;
+		uirec.br_blockcount = unmap_len - rlen;
+		unmap_len = rlen;
+
+		/* If this isn't a real mapping, we're done. */
+		if (!real_extent || uirec.br_blockcount == 0)
+			goto next_extent;
+
+		trace_xfs_reflink_remap(ip, uirec.br_startoff,
+				uirec.br_blockcount, uirec.br_startblock);
+
+		/* Update the refcount tree */
+		error = xfs_refcount_increase_extent(mp, &dfops, &uirec);
+		if (error)
+			goto out_defer;
+
+		/* Map the new blocks into the data fork. */
+		error = xfs_bmap_map_extent(mp, &dfops, ip, &uirec);
+		if (error)
+			goto out_defer;
+
+		/* Update quota accounting. */
+		xfs_trans_mod_dquot_byino(tp, ip, XFS_TRANS_DQ_BCOUNT,
+				uirec.br_blockcount);
+
+		/* Update dest isize if needed. */
+		newlen = XFS_FSB_TO_B(mp,
+				uirec.br_startoff + uirec.br_blockcount);
+		newlen = min_t(xfs_off_t, newlen, new_isize);
+		if (newlen > i_size_read(VFS_I(ip))) {
+			trace_xfs_reflink_update_inode_size(ip, newlen);
+			i_size_write(VFS_I(ip), newlen);
+			ip->i_d.di_size = newlen;
+			xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
+		}
+
+next_extent:
+		/* Process all the deferred stuff. */
+		error = xfs_defer_finish(&tp, &dfops, ip);
+		if (error)
+			goto out_defer;
+	}
+
+	error = xfs_trans_commit(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	if (error)
+		goto out;
+	return 0;
+
+out_defer:
+	xfs_defer_cancel(&dfops);
+out_cancel:
+	xfs_trans_cancel(tp);
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+	trace_xfs_reflink_remap_extent_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Iteratively remap one file's extents (and holes) to another's.
+ */
+STATIC int
+xfs_reflink_remap_blocks(
+	struct xfs_inode	*src,
+	xfs_fileoff_t		srcoff,
+	struct xfs_inode	*dest,
+	xfs_fileoff_t		destoff,
+	xfs_filblks_t		len,
+	xfs_off_t		new_isize)
+{
+	struct xfs_bmbt_irec	imap;
+	int			nimaps;
+	int			error = 0;
+	xfs_filblks_t		range_len;
+
+	/* drange = (destoff, destoff + len); srange = (srcoff, srcoff + len) */
+	while (len) {
+		trace_xfs_reflink_remap_blocks_loop(src, srcoff, len,
+				dest, destoff);
+		/* Read extent from the source file */
+		nimaps = 1;
+		xfs_ilock(src, XFS_ILOCK_EXCL);
+		error = xfs_bmapi_read(src, srcoff, len, &imap, &nimaps, 0);
+		xfs_iunlock(src, XFS_ILOCK_EXCL);
+		if (error)
+			goto err;
+		ASSERT(nimaps == 1);
+
+		trace_xfs_reflink_remap_imap(src, srcoff, len, XFS_IO_OVERWRITE,
+				&imap);
+
+		/* Translate imap into the destination file. */
+		range_len = imap.br_startoff + imap.br_blockcount - srcoff;
+		imap.br_startoff += destoff - srcoff;
+
+		/* Clear dest from destoff to the end of imap and map it in. */
+		error = xfs_reflink_remap_extent(dest, &imap, destoff,
+				new_isize);
+		if (error)
+			goto err;
+
+		if (fatal_signal_pending(current)) {
+			error = -EINTR;
+			goto err;
+		}
+
+		/* Advance drange/srange */
+		srcoff += range_len;
+		destoff += range_len;
+		len -= range_len;
+	}
+
+	return 0;
+
+err:
+	trace_xfs_reflink_remap_blocks_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Read a page's worth of file data into the page cache.  Return the page
+ * locked.
+ */
+static struct page *
+xfs_get_page(
+	struct inode	*inode,
+	xfs_off_t	offset)
+{
+	struct address_space	*mapping;
+	struct page		*page;
+	pgoff_t			n;
+
+	n = offset >> PAGE_SHIFT;
+	mapping = inode->i_mapping;
+	page = read_mapping_page(mapping, n, NULL);
+	if (IS_ERR(page))
+		return page;
+	if (!PageUptodate(page)) {
+		put_page(page);
+		return ERR_PTR(-EIO);
+	}
+	lock_page(page);
+	return page;
+}
+
+/*
+ * Compare extents of two files to see if they are the same.
+ */
+static int
+xfs_compare_extents(
+	struct inode	*src,
+	xfs_off_t	srcoff,
+	struct inode	*dest,
+	xfs_off_t	destoff,
+	xfs_off_t	len,
+	bool		*is_same)
+{
+	xfs_off_t	src_poff;
+	xfs_off_t	dest_poff;
+	void		*src_addr;
+	void		*dest_addr;
+	struct page	*src_page;
+	struct page	*dest_page;
+	xfs_off_t	cmp_len;
+	bool		same;
+	int		error;
+
+	error = -EINVAL;
+	same = true;
+	while (len) {
+		src_poff = srcoff & (PAGE_SIZE - 1);
+		dest_poff = destoff & (PAGE_SIZE - 1);
+		cmp_len = min(PAGE_SIZE - src_poff,
+			      PAGE_SIZE - dest_poff);
+		cmp_len = min(cmp_len, len);
+		ASSERT(cmp_len > 0);
+
+		trace_xfs_reflink_compare_extents(XFS_I(src), srcoff, cmp_len,
+				XFS_I(dest), destoff);
+
+		src_page = xfs_get_page(src, srcoff);
+		if (IS_ERR(src_page)) {
+			error = PTR_ERR(src_page);
+			goto out_error;
+		}
+		dest_page = xfs_get_page(dest, destoff);
+		if (IS_ERR(dest_page)) {
+			error = PTR_ERR(dest_page);
+			unlock_page(src_page);
+			put_page(src_page);
+			goto out_error;
+		}
+		src_addr = kmap_atomic(src_page);
+		dest_addr = kmap_atomic(dest_page);
+
+		flush_dcache_page(src_page);
+		flush_dcache_page(dest_page);
+
+		if (memcmp(src_addr + src_poff, dest_addr + dest_poff, cmp_len))
+			same = false;
+
+		kunmap_atomic(dest_addr);
+		kunmap_atomic(src_addr);
+		unlock_page(dest_page);
+		unlock_page(src_page);
+		put_page(dest_page);
+		put_page(src_page);
+
+		if (!same)
+			break;
+
+		srcoff += cmp_len;
+		destoff += cmp_len;
+		len -= cmp_len;
+	}
+
+	*is_same = same;
+	return 0;
+
+out_error:
+	trace_xfs_reflink_compare_extents_error(XFS_I(dest), error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Link a range of blocks from one file to another.
+ */
+int
+xfs_reflink_remap_range(
+	struct xfs_inode	*src,
+	xfs_off_t		srcoff,
+	struct xfs_inode	*dest,
+	xfs_off_t		destoff,
+	xfs_off_t		len,
+	unsigned int		flags)
+{
+	struct xfs_mount	*mp = src->i_mount;
+	xfs_fileoff_t		sfsbno, dfsbno;
+	xfs_filblks_t		fsblen;
+	int			error;
+	xfs_extlen_t		cowextsize;
+	bool			is_same;
+
+	if (!xfs_sb_version_hasreflink(&mp->m_sb))
+		return -EOPNOTSUPP;
+
+	if (XFS_FORCED_SHUTDOWN(mp))
+		return -EIO;
+
+	/* Don't reflink realtime inodes */
+	if (XFS_IS_REALTIME_INODE(src) || XFS_IS_REALTIME_INODE(dest))
+		return -EINVAL;
+
+	if (flags & ~XFS_REFLINK_ALL)
+		return -EINVAL;
+
+	trace_xfs_reflink_remap_range(src, srcoff, len, dest, destoff);
+
+	/* Lock both files against IO */
+	if (src->i_ino == dest->i_ino) {
+		xfs_ilock(src, XFS_IOLOCK_EXCL);
+		xfs_ilock(src, XFS_MMAPLOCK_EXCL);
+	} else {
+		xfs_lock_two_inodes(src, dest, XFS_IOLOCK_EXCL);
+		xfs_lock_two_inodes(src, dest, XFS_MMAPLOCK_EXCL);
+	}
+
+	/*
+	 * Check that the extents are the same.
+	 */
+	if (flags & XFS_REFLINK_DEDUPE) {
+		is_same = false;
+		error = xfs_compare_extents(VFS_I(src), srcoff, VFS_I(dest),
+				destoff, len, &is_same);
+		if (error)
+			goto out_error;
+		if (!is_same) {
+			error = -EBADE;
+			goto out_error;
+		}
+	}
+
+	error = xfs_reflink_set_inode_flag(src, dest);
+	if (error)
+		goto out_error;
+
+	/*
+	 * Invalidate the page cache so that we can clear any CoW mappings
+	 * in the destination file.
+	 */
+	truncate_inode_pages_range(&VFS_I(dest)->i_data, destoff,
+				   PAGE_ALIGN(destoff + len) - 1);
+
+	dfsbno = XFS_B_TO_FSBT(mp, destoff);
+	sfsbno = XFS_B_TO_FSBT(mp, srcoff);
+	fsblen = XFS_B_TO_FSB(mp, len);
+	error = xfs_reflink_remap_blocks(src, sfsbno, dest, dfsbno, fsblen,
+			destoff + len);
+	if (error)
+		goto out_error;
+
+	/*
+	 * Carry the cowextsize hint from src to dest if we're sharing the
+	 * entire source file to the entire destination file, the source file
+	 * has a cowextsize hint, and the destination file does not.
+	 */
+	cowextsize = 0;
+	if (srcoff == 0 && len == i_size_read(VFS_I(src)) &&
+	    (src->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE) &&
+	    destoff == 0 && len >= i_size_read(VFS_I(dest)) &&
+	    !(dest->i_d.di_flags2 & XFS_DIFLAG2_COWEXTSIZE))
+		cowextsize = src->i_d.di_cowextsize;
+
+	error = xfs_reflink_update_dest(dest, destoff + len, cowextsize);
+	if (error)
+		goto out_error;
+
+out_error:
+	xfs_iunlock(src, XFS_MMAPLOCK_EXCL);
+	xfs_iunlock(src, XFS_IOLOCK_EXCL);
+	if (src->i_ino != dest->i_ino) {
+		xfs_iunlock(dest, XFS_MMAPLOCK_EXCL);
+		xfs_iunlock(dest, XFS_IOLOCK_EXCL);
+	}
+	if (error)
+		trace_xfs_reflink_remap_range_error(dest, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * The user wants to preemptively CoW all shared blocks in this file,
+ * which enables us to turn off the reflink flag.  Iterate all
+ * extents which are not prealloc/delalloc to see which ranges are
+ * mentioned in the refcount tree, then read those blocks into the
+ * pagecache, dirty them, fsync them back out, and then we can update
+ * the inode flag.  What happens if we run out of memory? :)
+ */
+STATIC int
+xfs_reflink_dirty_extents(
+	struct xfs_inode	*ip,
+	xfs_fileoff_t		fbno,
+	xfs_filblks_t		end,
+	xfs_off_t		isize)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		rbno;
+	xfs_extlen_t		rlen;
+	xfs_off_t		fpos;
+	xfs_off_t		flen;
+	struct xfs_bmbt_irec	map[2];
+	int			nmaps;
+	int			error = 0;
+
+	while (end - fbno > 0) {
+		nmaps = 1;
+		/*
+		 * Look for extents in the file.  Skip holes, delalloc, or
+		 * unwritten extents; they can't be reflinked.
+		 */
+		error = xfs_bmapi_read(ip, fbno, end - fbno, map, &nmaps, 0);
+		if (error)
+			goto out;
+		if (nmaps == 0)
+			break;
+		if (map[0].br_startblock == HOLESTARTBLOCK ||
+		    map[0].br_startblock == DELAYSTARTBLOCK ||
+		    ISUNWRITTEN(&map[0]))
+			goto next;
+
+		map[1] = map[0];
+		while (map[1].br_blockcount) {
+			agno = XFS_FSB_TO_AGNO(mp, map[1].br_startblock);
+			agbno = XFS_FSB_TO_AGBNO(mp, map[1].br_startblock);
+			aglen = map[1].br_blockcount;
+
+			error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+					&rbno, &rlen, true);
+			if (error)
+				goto out;
+			if (rbno == NULLAGBLOCK)
+				break;
+
+			/* Dirty the pages */
+			xfs_iunlock(ip, XFS_ILOCK_EXCL);
+			fpos = XFS_FSB_TO_B(mp, map[1].br_startoff +
+					(rbno - agbno));
+			flen = XFS_FSB_TO_B(mp, rlen);
+			if (fpos + flen > isize)
+				flen = isize - fpos;
+			error = iomap_file_dirty(VFS_I(ip), fpos, flen,
+					&xfs_iomap_ops);
+			xfs_ilock(ip, XFS_ILOCK_EXCL);
+			if (error)
+				goto out;
+
+			map[1].br_blockcount -= (rbno - agbno + rlen);
+			map[1].br_startoff += (rbno - agbno + rlen);
+			map[1].br_startblock += (rbno - agbno + rlen);
+		}
+
+next:
+		fbno = map[0].br_startoff + map[0].br_blockcount;
+	}
+out:
+	return error;
+}
+
+/* Clear the inode reflink flag if there are no shared extents. */
+int
+xfs_reflink_clear_inode_flag(
+	struct xfs_inode	*ip,
+	struct xfs_trans	**tpp)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		fbno;
+	xfs_filblks_t		end;
+	xfs_agnumber_t		agno;
+	xfs_agblock_t		agbno;
+	xfs_extlen_t		aglen;
+	xfs_agblock_t		rbno;
+	xfs_extlen_t		rlen;
+	struct xfs_bmbt_irec	map;
+	int			nmaps;
+	int			error = 0;
+
+	ASSERT(xfs_is_reflink_inode(ip));
+
+	fbno = 0;
+	end = XFS_B_TO_FSB(mp, i_size_read(VFS_I(ip)));
+	while (end - fbno > 0) {
+		nmaps = 1;
+		/*
+		 * Look for extents in the file.  Skip holes, delalloc, or
+		 * unwritten extents; they can't be reflinked.
+		 */
+		error = xfs_bmapi_read(ip, fbno, end - fbno, &map, &nmaps, 0);
+		if (error)
+			return error;
+		if (nmaps == 0)
+			break;
+		if (map.br_startblock == HOLESTARTBLOCK ||
+		    map.br_startblock == DELAYSTARTBLOCK ||
+		    ISUNWRITTEN(&map))
+			goto next;
+
+		agno = XFS_FSB_TO_AGNO(mp, map.br_startblock);
+		agbno = XFS_FSB_TO_AGBNO(mp, map.br_startblock);
+		aglen = map.br_blockcount;
+
+		error = xfs_reflink_find_shared(mp, agno, agbno, aglen,
+				&rbno, &rlen, false);
+		if (error)
+			return error;
+		/* Is there still a shared block here? */
+		if (rbno != NULLAGBLOCK)
+			return 0;
+next:
+		fbno = map.br_startoff + map.br_blockcount;
+	}
+
+	/*
+	 * We didn't find any shared blocks so turn off the reflink flag.
+	 * First, get rid of any leftover CoW mappings.
+	 */
+	error = xfs_reflink_cancel_cow_blocks(ip, tpp, 0, NULLFILEOFF);
+	if (error)
+		return error;
+
+	/* Clear the inode flag. */
+	trace_xfs_reflink_unset_inode_flag(ip);
+	ip->i_d.di_flags2 &= ~XFS_DIFLAG2_REFLINK;
+	xfs_inode_clear_cowblocks_tag(ip);
+	xfs_trans_ijoin(*tpp, ip, 0);
+	xfs_trans_log_inode(*tpp, ip, XFS_ILOG_CORE);
+
+	return error;
+}
+
+/*
+ * Clear the inode reflink flag if there are no shared extents and the size
+ * hasn't changed.
+ */
+STATIC int
+xfs_reflink_try_clear_inode_flag(
+	struct xfs_inode	*ip)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	struct xfs_trans	*tp;
+	int			error = 0;
+
+	/* Start a rolling transaction to remove the mappings */
+	error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, 0, 0, 0, &tp);
+	if (error)
+		return error;
+
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	xfs_trans_ijoin(tp, ip, 0);
+
+	error = xfs_reflink_clear_inode_flag(ip, &tp);
+	if (error)
+		goto cancel;
+
+	error = xfs_trans_commit(tp);
+	if (error)
+		goto out;
+
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return 0;
+cancel:
+	xfs_trans_cancel(tp);
+out:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+	return error;
+}
+
+/*
+ * Pre-COW all shared blocks within a given byte range of a file and turn off
+ * the reflink flag if we unshare all of the file's blocks.
+ */
+int
+xfs_reflink_unshare(
+	struct xfs_inode	*ip,
+	xfs_off_t		offset,
+	xfs_off_t		len)
+{
+	struct xfs_mount	*mp = ip->i_mount;
+	xfs_fileoff_t		fbno;
+	xfs_filblks_t		end;
+	xfs_off_t		isize;
+	int			error;
+
+	if (!xfs_is_reflink_inode(ip))
+		return 0;
+
+	trace_xfs_reflink_unshare(ip, offset, len);
+
+	inode_dio_wait(VFS_I(ip));
+
+	/* Try to CoW the selected ranges */
+	xfs_ilock(ip, XFS_ILOCK_EXCL);
+	fbno = XFS_B_TO_FSBT(mp, offset);
+	isize = i_size_read(VFS_I(ip));
+	end = XFS_B_TO_FSB(mp, offset + len);
+	error = xfs_reflink_dirty_extents(ip, fbno, end, isize);
+	if (error)
+		goto out_unlock;
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+
+	/* Wait for the IO to finish */
+	error = filemap_write_and_wait(VFS_I(ip)->i_mapping);
+	if (error)
+		goto out;
+
+	/* Turn off the reflink flag if possible. */
+	error = xfs_reflink_try_clear_inode_flag(ip);
+	if (error)
+		goto out;
+
+	return 0;
+
+out_unlock:
+	xfs_iunlock(ip, XFS_ILOCK_EXCL);
+out:
+	trace_xfs_reflink_unshare_error(ip, error, _RET_IP_);
+	return error;
+}
+
+/*
+ * Does this inode have any real CoW reservations?
+ */
+bool
+xfs_reflink_has_real_cow_blocks(
+	struct xfs_inode		*ip)
+{
+	struct xfs_bmbt_irec		irec;
+	struct xfs_ifork		*ifp;
+	struct xfs_bmbt_rec_host	*gotp;
+	xfs_extnum_t			idx;
+
+	if (!xfs_is_reflink_inode(ip))
+		return false;
+
+	/* Go find the old extent in the CoW fork. */
+	ifp = XFS_IFORK_PTR(ip, XFS_COW_FORK);
+	gotp = xfs_iext_bno_to_ext(ifp, 0, &idx);
+	while (gotp) {
+		xfs_bmbt_get_all(gotp, &irec);
+
+		if (!isnullstartblock(irec.br_startblock))
+			return true;
+
+		/* Roll on... */
+		idx++;
+		if (idx >= ifp->if_bytes / sizeof(xfs_bmbt_rec_t))
+			break;
+		gotp = xfs_iext_get_ext(ifp, idx);
+	}
+
+	return false;
+}
diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h
new file mode 100644
index 000000000000..5dc3c8ac12aa
--- /dev/null
+++ b/fs/xfs/xfs_reflink.h
@@ -0,0 +1,58 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#ifndef __XFS_REFLINK_H
+#define __XFS_REFLINK_H 1
+
+extern int xfs_reflink_find_shared(struct xfs_mount *mp, xfs_agnumber_t agno,
+		xfs_agblock_t agbno, xfs_extlen_t aglen, xfs_agblock_t *fbno,
+		xfs_extlen_t *flen, bool find_maximal);
+extern int xfs_reflink_trim_around_shared(struct xfs_inode *ip,
+		struct xfs_bmbt_irec *irec, bool *shared, bool *trimmed);
+
+extern int xfs_reflink_reserve_cow_range(struct xfs_inode *ip,
+		xfs_off_t offset, xfs_off_t count);
+extern int xfs_reflink_allocate_cow_range(struct xfs_inode *ip,
+		xfs_off_t offset, xfs_off_t count);
+extern bool xfs_reflink_find_cow_mapping(struct xfs_inode *ip, xfs_off_t offset,
+		struct xfs_bmbt_irec *imap, bool *need_alloc);
+extern int xfs_reflink_trim_irec_to_next_cow(struct xfs_inode *ip,
+		xfs_fileoff_t offset_fsb, struct xfs_bmbt_irec *imap);
+
+extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip,
+		struct xfs_trans **tpp, xfs_fileoff_t offset_fsb,
+		xfs_fileoff_t end_fsb);
+extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
+extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t count);
+extern int xfs_reflink_recover_cow(struct xfs_mount *mp);
+#define XFS_REFLINK_DEDUPE	1	/* only reflink if contents match */
+#define XFS_REFLINK_ALL		(XFS_REFLINK_DEDUPE)
+extern int xfs_reflink_remap_range(struct xfs_inode *src, xfs_off_t srcoff,
+		struct xfs_inode *dest, xfs_off_t destoff, xfs_off_t len,
+		unsigned int flags);
+extern int xfs_reflink_clear_inode_flag(struct xfs_inode *ip,
+		struct xfs_trans **tpp);
+extern int xfs_reflink_unshare(struct xfs_inode *ip, xfs_off_t offset,
+		xfs_off_t len);
+
+extern bool xfs_reflink_has_real_cow_blocks(struct xfs_inode *ip);
+
+#endif /* __XFS_REFLINK_H */
diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c
index 0432a459871c..73c827831551 100644
--- a/fs/xfs/xfs_rmap_item.c
+++ b/fs/xfs/xfs_rmap_item.c
@@ -441,8 +441,11 @@ xfs_rui_recover(
 				   XFS_FSB_TO_DADDR(mp, rmap->me_startblock));
 		switch (rmap->me_flags & XFS_RMAP_EXTENT_TYPE_MASK) {
 		case XFS_RMAP_EXTENT_MAP:
+		case XFS_RMAP_EXTENT_MAP_SHARED:
 		case XFS_RMAP_EXTENT_UNMAP:
+		case XFS_RMAP_EXTENT_UNMAP_SHARED:
 		case XFS_RMAP_EXTENT_CONVERT:
+		case XFS_RMAP_EXTENT_CONVERT_SHARED:
 		case XFS_RMAP_EXTENT_ALLOC:
 		case XFS_RMAP_EXTENT_FREE:
 			op_ok = true;
@@ -481,12 +484,21 @@ xfs_rui_recover(
 		case XFS_RMAP_EXTENT_MAP:
 			type = XFS_RMAP_MAP;
 			break;
+		case XFS_RMAP_EXTENT_MAP_SHARED:
+			type = XFS_RMAP_MAP_SHARED;
+			break;
 		case XFS_RMAP_EXTENT_UNMAP:
 			type = XFS_RMAP_UNMAP;
 			break;
+		case XFS_RMAP_EXTENT_UNMAP_SHARED:
+			type = XFS_RMAP_UNMAP_SHARED;
+			break;
 		case XFS_RMAP_EXTENT_CONVERT:
 			type = XFS_RMAP_CONVERT;
 			break;
+		case XFS_RMAP_EXTENT_CONVERT_SHARED:
+			type = XFS_RMAP_CONVERT_SHARED;
+			break;
 		case XFS_RMAP_EXTENT_ALLOC:
 			type = XFS_RMAP_ALLOC;
 			break;
diff --git a/fs/xfs/xfs_stats.c b/fs/xfs/xfs_stats.c
index 6e812fe0fd43..12d48cd8f8a4 100644
--- a/fs/xfs/xfs_stats.c
+++ b/fs/xfs/xfs_stats.c
@@ -62,6 +62,7 @@ int xfs_stats_format(struct xfsstats __percpu *stats, char *buf)
 		{ "ibt2",		XFSSTAT_END_IBT_V2		},
 		{ "fibt2",		XFSSTAT_END_FIBT_V2		},
 		{ "rmapbt",		XFSSTAT_END_RMAP_V2		},
+		{ "refcntbt",		XFSSTAT_END_REFCOUNT		},
 		/* we print both series of quota information together */
 		{ "qm",			XFSSTAT_END_QM			},
 	};
diff --git a/fs/xfs/xfs_stats.h b/fs/xfs/xfs_stats.h
index 657865f51e78..79ad2e69fc33 100644
--- a/fs/xfs/xfs_stats.h
+++ b/fs/xfs/xfs_stats.h
@@ -213,7 +213,23 @@ struct xfsstats {
 	__uint32_t		xs_rmap_2_alloc;
 	__uint32_t		xs_rmap_2_free;
 	__uint32_t		xs_rmap_2_moves;
-#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_RMAP_V2+6)
+#define XFSSTAT_END_REFCOUNT		(XFSSTAT_END_RMAP_V2 + 15)
+	__uint32_t		xs_refcbt_2_lookup;
+	__uint32_t		xs_refcbt_2_compare;
+	__uint32_t		xs_refcbt_2_insrec;
+	__uint32_t		xs_refcbt_2_delrec;
+	__uint32_t		xs_refcbt_2_newroot;
+	__uint32_t		xs_refcbt_2_killroot;
+	__uint32_t		xs_refcbt_2_increment;
+	__uint32_t		xs_refcbt_2_decrement;
+	__uint32_t		xs_refcbt_2_lshift;
+	__uint32_t		xs_refcbt_2_rshift;
+	__uint32_t		xs_refcbt_2_split;
+	__uint32_t		xs_refcbt_2_join;
+	__uint32_t		xs_refcbt_2_alloc;
+	__uint32_t		xs_refcbt_2_free;
+	__uint32_t		xs_refcbt_2_moves;
+#define XFSSTAT_END_XQMSTAT		(XFSSTAT_END_REFCOUNT + 6)
 	__uint32_t		xs_qm_dqreclaims;
 	__uint32_t		xs_qm_dqreclaim_misses;
 	__uint32_t		xs_qm_dquot_dups;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index 2d092f9577ca..ade4691e3f74 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -47,6 +47,9 @@
 #include "xfs_sysfs.h"
 #include "xfs_ondisk.h"
 #include "xfs_rmap_item.h"
+#include "xfs_refcount_item.h"
+#include "xfs_bmap_item.h"
+#include "xfs_reflink.h"
 
 #include <linux/namei.h>
 #include <linux/init.h>
@@ -936,6 +939,7 @@ xfs_fs_destroy_inode(
 	struct inode		*inode)
 {
 	struct xfs_inode	*ip = XFS_I(inode);
+	int			error;
 
 	trace_xfs_destroy_inode(ip);
 
@@ -943,6 +947,14 @@ xfs_fs_destroy_inode(
 	XFS_STATS_INC(ip->i_mount, vn_rele);
 	XFS_STATS_INC(ip->i_mount, vn_remove);
 
+	if (xfs_is_reflink_inode(ip)) {
+		error = xfs_reflink_cancel_cow_range(ip, 0, NULLFILEOFF);
+		if (error && !XFS_FORCED_SHUTDOWN(ip->i_mount))
+			xfs_warn(ip->i_mount,
+"Error %d while evicting CoW blocks for inode %llu.",
+					error, ip->i_ino);
+	}
+
 	xfs_inactive(ip);
 
 	ASSERT(XFS_FORCED_SHUTDOWN(ip->i_mount) || ip->i_delayed_blks == 0);
@@ -1006,6 +1018,16 @@ xfs_fs_drop_inode(
 {
 	struct xfs_inode	*ip = XFS_I(inode);
 
+	/*
+	 * If this unlinked inode is in the middle of recovery, don't
+	 * drop the inode just yet; log recovery will take care of
+	 * that.  See the comment for this inode flag.
+	 */
+	if (ip->i_flags & XFS_IRECOVERY) {
+		ASSERT(ip->i_mount->m_log->l_flags & XLOG_RECOVERY_NEEDED);
+		return 0;
+	}
+
 	return generic_drop_inode(inode) || (ip->i_flags & XFS_IDONTCACHE);
 }
 
@@ -1296,10 +1318,31 @@ xfs_fs_remount(
 		xfs_restore_resvblks(mp);
 		xfs_log_work_queue(mp);
 		xfs_queue_eofblocks(mp);
+
+		/* Recover any CoW blocks that never got remapped. */
+		error = xfs_reflink_recover_cow(mp);
+		if (error) {
+			xfs_err(mp,
+	"Error %d recovering leftover CoW allocations.", error);
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			return error;
+		}
+
+		/* Create the per-AG metadata reservation pool .*/
+		error = xfs_fs_reserve_ag_blocks(mp);
+		if (error && error != -ENOSPC)
+			return error;
 	}
 
 	/* rw -> ro */
 	if (!(mp->m_flags & XFS_MOUNT_RDONLY) && (*flags & MS_RDONLY)) {
+		/* Free the per-AG metadata reservation pool. */
+		error = xfs_fs_unreserve_ag_blocks(mp);
+		if (error) {
+			xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
+			return error;
+		}
+
 		/*
 		 * Before we sync the metadata, we need to free up the reserve
 		 * block pool so that the used block count in the superblock on
@@ -1490,6 +1533,7 @@ xfs_fs_fill_super(
 	atomic_set(&mp->m_active_trans, 0);
 	INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker);
 	INIT_DELAYED_WORK(&mp->m_eofblocks_work, xfs_eofblocks_worker);
+	INIT_DELAYED_WORK(&mp->m_cowblocks_work, xfs_cowblocks_worker);
 	mp->m_kobj.kobject.kset = xfs_kset;
 
 	mp->m_super = sb;
@@ -1572,6 +1616,9 @@ xfs_fs_fill_super(
 			"DAX unsupported by block device. Turning off DAX.");
 			mp->m_flags &= ~XFS_MOUNT_DAX;
 		}
+		if (xfs_sb_version_hasreflink(&mp->m_sb))
+			xfs_alert(mp,
+		"DAX and reflink have not been tested together!");
 	}
 
 	if (xfs_sb_version_hasrmapbt(&mp->m_sb)) {
@@ -1585,6 +1632,10 @@ xfs_fs_fill_super(
 	"EXPERIMENTAL reverse mapping btree feature enabled. Use at your own risk!");
 	}
 
+	if (xfs_sb_version_hasreflink(&mp->m_sb))
+		xfs_alert(mp,
+	"EXPERIMENTAL reflink feature enabled. Use at your own risk!");
+
 	error = xfs_mountfs(mp);
 	if (error)
 		goto out_filestream_unmount;
@@ -1788,8 +1839,38 @@ xfs_init_zones(void)
 	if (!xfs_rui_zone)
 		goto out_destroy_rud_zone;
 
+	xfs_cud_zone = kmem_zone_init(sizeof(struct xfs_cud_log_item),
+			"xfs_cud_item");
+	if (!xfs_cud_zone)
+		goto out_destroy_rui_zone;
+
+	xfs_cui_zone = kmem_zone_init(
+			xfs_cui_log_item_sizeof(XFS_CUI_MAX_FAST_EXTENTS),
+			"xfs_cui_item");
+	if (!xfs_cui_zone)
+		goto out_destroy_cud_zone;
+
+	xfs_bud_zone = kmem_zone_init(sizeof(struct xfs_bud_log_item),
+			"xfs_bud_item");
+	if (!xfs_bud_zone)
+		goto out_destroy_cui_zone;
+
+	xfs_bui_zone = kmem_zone_init(
+			xfs_bui_log_item_sizeof(XFS_BUI_MAX_FAST_EXTENTS),
+			"xfs_bui_item");
+	if (!xfs_bui_zone)
+		goto out_destroy_bud_zone;
+
 	return 0;
 
+ out_destroy_bud_zone:
+	kmem_zone_destroy(xfs_bud_zone);
+ out_destroy_cui_zone:
+	kmem_zone_destroy(xfs_cui_zone);
+ out_destroy_cud_zone:
+	kmem_zone_destroy(xfs_cud_zone);
+ out_destroy_rui_zone:
+	kmem_zone_destroy(xfs_rui_zone);
  out_destroy_rud_zone:
 	kmem_zone_destroy(xfs_rud_zone);
  out_destroy_icreate_zone:
@@ -1832,6 +1913,10 @@ xfs_destroy_zones(void)
 	 * destroy caches.
 	 */
 	rcu_barrier();
+	kmem_zone_destroy(xfs_bui_zone);
+	kmem_zone_destroy(xfs_bud_zone);
+	kmem_zone_destroy(xfs_cui_zone);
+	kmem_zone_destroy(xfs_cud_zone);
 	kmem_zone_destroy(xfs_rui_zone);
 	kmem_zone_destroy(xfs_rud_zone);
 	kmem_zone_destroy(xfs_icreate_zone);
@@ -1885,6 +1970,8 @@ init_xfs_fs(void)
 
 	xfs_extent_free_init_defer_op();
 	xfs_rmap_update_init_defer_op();
+	xfs_refcount_update_init_defer_op();
+	xfs_bmap_update_init_defer_op();
 
 	xfs_dir_startup();
 
diff --git a/fs/xfs/xfs_sysctl.c b/fs/xfs/xfs_sysctl.c
index aed74d3f8da9..afe1f66aaa69 100644
--- a/fs/xfs/xfs_sysctl.c
+++ b/fs/xfs/xfs_sysctl.c
@@ -184,6 +184,15 @@ static struct ctl_table xfs_table[] = {
 		.extra1		= &xfs_params.eofb_timer.min,
 		.extra2		= &xfs_params.eofb_timer.max,
 	},
+	{
+		.procname	= "speculative_cow_prealloc_lifetime",
+		.data		= &xfs_params.cowb_timer.val,
+		.maxlen		= sizeof(int),
+		.mode		= 0644,
+		.proc_handler	= proc_dointvec_minmax,
+		.extra1		= &xfs_params.cowb_timer.min,
+		.extra2		= &xfs_params.cowb_timer.max,
+	},
 	/* please keep this the last entry */
 #ifdef CONFIG_PROC_FS
 	{
diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h
index ffef45375754..984a3499cfe3 100644
--- a/fs/xfs/xfs_sysctl.h
+++ b/fs/xfs/xfs_sysctl.h
@@ -48,6 +48,7 @@ typedef struct xfs_param {
 	xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */
 	xfs_sysctl_val_t fstrm_timer;	/* Filestream dir-AG assoc'n timeout. */
 	xfs_sysctl_val_t eofb_timer;	/* Interval between eofb scan wakeups */
+	xfs_sysctl_val_t cowb_timer;	/* Interval between cowb scan wakeups */
 } xfs_param_t;
 
 /*
diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h
index 16093c7dacde..ad188d3a83f3 100644
--- a/fs/xfs/xfs_trace.h
+++ b/fs/xfs/xfs_trace.h
@@ -39,6 +39,7 @@ struct xfs_buf_log_format;
 struct xfs_inode_log_format;
 struct xfs_bmbt_irec;
 struct xfs_btree_cur;
+struct xfs_refcount_irec;
 
 DECLARE_EVENT_CLASS(xfs_attr_list_class,
 	TP_PROTO(struct xfs_attr_list_context *ctx),
@@ -135,6 +136,8 @@ DEFINE_PERAG_REF_EVENT(xfs_perag_set_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_reclaim);
 DEFINE_PERAG_REF_EVENT(xfs_perag_set_eofblocks);
 DEFINE_PERAG_REF_EVENT(xfs_perag_clear_eofblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_set_cowblocks);
+DEFINE_PERAG_REF_EVENT(xfs_perag_clear_cowblocks);
 
 DECLARE_EVENT_CLASS(xfs_ag_class,
 	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno),
@@ -268,10 +271,10 @@ DECLARE_EVENT_CLASS(xfs_bmap_class,
 		__field(unsigned long, caller_ip)
 	),
 	TP_fast_assign(
-		struct xfs_ifork	*ifp = (state & BMAP_ATTRFORK) ?
-						ip->i_afp : &ip->i_df;
+		struct xfs_ifork	*ifp;
 		struct xfs_bmbt_irec	r;
 
+		ifp = xfs_iext_state_to_fork(ip, state);
 		xfs_bmbt_get_all(xfs_iext_get_ext(ifp, idx), &r);
 		__entry->dev = VFS_I(ip)->i_sb->s_dev;
 		__entry->ino = ip->i_ino;
@@ -686,6 +689,9 @@ DEFINE_INODE_EVENT(xfs_dquot_dqdetach);
 DEFINE_INODE_EVENT(xfs_inode_set_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_clear_eofblocks_tag);
 DEFINE_INODE_EVENT(xfs_inode_free_eofblocks_invalid);
+DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
+DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
 
 DEFINE_INODE_EVENT(xfs_filemap_fault);
 DEFINE_INODE_EVENT(xfs_filemap_pmd_fault);
@@ -2581,10 +2587,20 @@ DEFINE_RMAPBT_EVENT(xfs_rmap_delete);
 DEFINE_AG_ERROR_EVENT(xfs_rmap_insert_error);
 DEFINE_AG_ERROR_EVENT(xfs_rmap_delete_error);
 DEFINE_AG_ERROR_EVENT(xfs_rmap_update_error);
+
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_query);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_candidate);
+DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range);
 DEFINE_RMAPBT_EVENT(xfs_rmap_lookup_le_range_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_right_neighbor_result);
 DEFINE_RMAPBT_EVENT(xfs_rmap_find_left_neighbor_result);
 
+/* deferred bmbt updates */
+#define DEFINE_BMAP_DEFERRED_EVENT	DEFINE_RMAP_DEFERRED_EVENT
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_defer);
+DEFINE_BMAP_DEFERRED_EVENT(xfs_bmap_deferred);
+
 /* per-AG reservation */
 DECLARE_EVENT_CLASS(xfs_ag_resv_class,
 	TP_PROTO(struct xfs_perag *pag, enum xfs_ag_resv_type resv,
@@ -2639,6 +2655,728 @@ DEFINE_AG_RESV_EVENT(xfs_ag_resv_needed);
 DEFINE_AG_ERROR_EVENT(xfs_ag_resv_free_error);
 DEFINE_AG_ERROR_EVENT(xfs_ag_resv_init_error);
 
+/* refcount tracepoint classes */
+
+/* reuse the discard trace class for agbno/aglen-based traces */
+#define DEFINE_AG_EXTENT_EVENT(name) DEFINE_DISCARD_EVENT(name)
+
+/* ag btree lookup tracepoint class */
+#define XFS_AG_BTREE_CMP_FORMAT_STR \
+	{ XFS_LOOKUP_EQ,	"eq" }, \
+	{ XFS_LOOKUP_LE,	"le" }, \
+	{ XFS_LOOKUP_GE,	"ge" }
+DECLARE_EVENT_CLASS(xfs_ag_btree_lookup_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 xfs_agblock_t agbno, xfs_lookup_t dir),
+	TP_ARGS(mp, agno, agbno, dir),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_lookup_t, dir)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->agbno = agbno;
+		__entry->dir = dir;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u cmp %s(%d)\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->agbno,
+		  __print_symbolic(__entry->dir, XFS_AG_BTREE_CMP_FORMAT_STR),
+		  __entry->dir)
+)
+
+#define DEFINE_AG_BTREE_LOOKUP_EVENT(name) \
+DEFINE_EVENT(xfs_ag_btree_lookup_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 xfs_agblock_t agbno, xfs_lookup_t dir), \
+	TP_ARGS(mp, agno, agbno, dir))
+
+/* single-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *irec),
+	TP_ARGS(mp, agno, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, startblock)
+		__field(xfs_extlen_t, blockcount)
+		__field(xfs_nlink_t, refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startblock = irec->rc_startblock;
+		__entry->blockcount = irec->rc_blockcount;
+		__entry->refcount = irec->rc_refcount;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startblock,
+		  __entry->blockcount,
+		  __entry->refcount)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *irec), \
+	TP_ARGS(mp, agno, irec))
+
+/* single-rcext and an agbno tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_extent_at_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno),
+	TP_ARGS(mp, agno, irec, agbno),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, startblock)
+		__field(xfs_extlen_t, blockcount)
+		__field(xfs_nlink_t, refcount)
+		__field(xfs_agblock_t, agbno)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->startblock = irec->rc_startblock;
+		__entry->blockcount = irec->rc_blockcount;
+		__entry->refcount = irec->rc_refcount;
+		__entry->agbno = agbno;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u @ agbno %u\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->startblock,
+		  __entry->blockcount,
+		  __entry->refcount,
+		  __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_extent_at_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *irec, xfs_agblock_t agbno), \
+	TP_ARGS(mp, agno, irec, agbno))
+
+/* double-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2),
+	TP_ARGS(mp, agno, i1, i2),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, i1_startblock)
+		__field(xfs_extlen_t, i1_blockcount)
+		__field(xfs_nlink_t, i1_refcount)
+		__field(xfs_agblock_t, i2_startblock)
+		__field(xfs_extlen_t, i2_blockcount)
+		__field(xfs_nlink_t, i2_refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->i1_startblock = i1->rc_startblock;
+		__entry->i1_blockcount = i1->rc_blockcount;
+		__entry->i1_refcount = i1->rc_refcount;
+		__entry->i2_startblock = i2->rc_startblock;
+		__entry->i2_blockcount = i2->rc_blockcount;
+		__entry->i2_refcount = i2->rc_refcount;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->i1_startblock,
+		  __entry->i1_blockcount,
+		  __entry->i1_refcount,
+		  __entry->i2_startblock,
+		  __entry->i2_blockcount,
+		  __entry->i2_refcount)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2), \
+	TP_ARGS(mp, agno, i1, i2))
+
+/* double-rcext and an agbno tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_double_extent_at_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+		 xfs_agblock_t agbno),
+	TP_ARGS(mp, agno, i1, i2, agbno),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, i1_startblock)
+		__field(xfs_extlen_t, i1_blockcount)
+		__field(xfs_nlink_t, i1_refcount)
+		__field(xfs_agblock_t, i2_startblock)
+		__field(xfs_extlen_t, i2_blockcount)
+		__field(xfs_nlink_t, i2_refcount)
+		__field(xfs_agblock_t, agbno)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->i1_startblock = i1->rc_startblock;
+		__entry->i1_blockcount = i1->rc_blockcount;
+		__entry->i1_refcount = i1->rc_refcount;
+		__entry->i2_startblock = i2->rc_startblock;
+		__entry->i2_blockcount = i2->rc_blockcount;
+		__entry->i2_refcount = i2->rc_refcount;
+		__entry->agbno = agbno;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u @ agbno %u\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->i1_startblock,
+		  __entry->i1_blockcount,
+		  __entry->i1_refcount,
+		  __entry->i2_startblock,
+		  __entry->i2_blockcount,
+		  __entry->i2_refcount,
+		  __entry->agbno)
+)
+
+#define DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_double_extent_at_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+		 xfs_agblock_t agbno), \
+	TP_ARGS(mp, agno, i1, i2, agbno))
+
+/* triple-rcext tracepoint class */
+DECLARE_EVENT_CLASS(xfs_refcount_triple_extent_class,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2,
+		 struct xfs_refcount_irec *i3),
+	TP_ARGS(mp, agno, i1, i2, i3),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(xfs_agblock_t, i1_startblock)
+		__field(xfs_extlen_t, i1_blockcount)
+		__field(xfs_nlink_t, i1_refcount)
+		__field(xfs_agblock_t, i2_startblock)
+		__field(xfs_extlen_t, i2_blockcount)
+		__field(xfs_nlink_t, i2_refcount)
+		__field(xfs_agblock_t, i3_startblock)
+		__field(xfs_extlen_t, i3_blockcount)
+		__field(xfs_nlink_t, i3_refcount)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->i1_startblock = i1->rc_startblock;
+		__entry->i1_blockcount = i1->rc_blockcount;
+		__entry->i1_refcount = i1->rc_refcount;
+		__entry->i2_startblock = i2->rc_startblock;
+		__entry->i2_blockcount = i2->rc_blockcount;
+		__entry->i2_refcount = i2->rc_refcount;
+		__entry->i3_startblock = i3->rc_startblock;
+		__entry->i3_blockcount = i3->rc_blockcount;
+		__entry->i3_refcount = i3->rc_refcount;
+	),
+	TP_printk("dev %d:%d agno %u agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u -- "
+		  "agbno %u len %u refcount %u\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->agno,
+		  __entry->i1_startblock,
+		  __entry->i1_blockcount,
+		  __entry->i1_refcount,
+		  __entry->i2_startblock,
+		  __entry->i2_blockcount,
+		  __entry->i2_refcount,
+		  __entry->i3_startblock,
+		  __entry->i3_blockcount,
+		  __entry->i3_refcount)
+);
+
+#define DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(name) \
+DEFINE_EVENT(xfs_refcount_triple_extent_class, name, \
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno, \
+		 struct xfs_refcount_irec *i1, struct xfs_refcount_irec *i2, \
+		 struct xfs_refcount_irec *i3), \
+	TP_ARGS(mp, agno, i1, i2, i3))
+
+/* refcount btree tracepoints */
+DEFINE_BUSY_EVENT(xfs_refcountbt_alloc_block);
+DEFINE_BUSY_EVENT(xfs_refcountbt_free_block);
+DEFINE_AG_BTREE_LOOKUP_EVENT(xfs_refcount_lookup);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_get);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_update);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_insert);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_delete);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_insert_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_delete_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_update_error);
+
+/* refcount adjustment tracepoints */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_decrease);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_increase);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_cow_decrease);
+DEFINE_REFCOUNT_TRIPLE_EXTENT_EVENT(xfs_refcount_merge_center_extents);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_modify_extent);
+DEFINE_REFCOUNT_EXTENT_EVENT(xfs_refcount_recover_extent);
+DEFINE_REFCOUNT_EXTENT_AT_EVENT(xfs_refcount_split_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_EVENT(xfs_refcount_merge_right_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_left_extent);
+DEFINE_REFCOUNT_DOUBLE_EXTENT_AT_EVENT(xfs_refcount_find_right_extent);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_adjust_cow_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_center_extents_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_modify_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_split_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_merge_right_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_left_extent_error);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_right_extent_error);
+
+/* reflink helpers */
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared);
+DEFINE_AG_EXTENT_EVENT(xfs_refcount_find_shared_result);
+DEFINE_AG_ERROR_EVENT(xfs_refcount_find_shared_error);
+#define DEFINE_REFCOUNT_DEFERRED_EVENT DEFINE_PHYS_EXTENT_DEFERRED_EVENT
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_defer);
+DEFINE_REFCOUNT_DEFERRED_EVENT(xfs_refcount_deferred);
+
+TRACE_EVENT(xfs_refcount_finish_one_leftover,
+	TP_PROTO(struct xfs_mount *mp, xfs_agnumber_t agno,
+		 int type, xfs_agblock_t agbno, xfs_extlen_t len,
+		 xfs_agblock_t new_agbno, xfs_extlen_t new_len),
+	TP_ARGS(mp, agno, type, agbno, len, new_agbno, new_len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_agnumber_t, agno)
+		__field(int, type)
+		__field(xfs_agblock_t, agbno)
+		__field(xfs_extlen_t, len)
+		__field(xfs_agblock_t, new_agbno)
+		__field(xfs_extlen_t, new_len)
+	),
+	TP_fast_assign(
+		__entry->dev = mp->m_super->s_dev;
+		__entry->agno = agno;
+		__entry->type = type;
+		__entry->agbno = agbno;
+		__entry->len = len;
+		__entry->new_agbno = new_agbno;
+		__entry->new_len = new_len;
+	),
+	TP_printk("dev %d:%d type %d agno %u agbno %u len %u new_agbno %u new_len %u",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->type,
+		  __entry->agno,
+		  __entry->agbno,
+		  __entry->len,
+		  __entry->new_agbno,
+		  __entry->new_len)
+);
+
+/* simple inode-based error/%ip tracepoint class */
+DECLARE_EVENT_CLASS(xfs_inode_error_class,
+	TP_PROTO(struct xfs_inode *ip, int error, unsigned long caller_ip),
+	TP_ARGS(ip, error, caller_ip),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(int, error)
+		__field(unsigned long, caller_ip)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->error = error;
+		__entry->caller_ip = caller_ip;
+	),
+	TP_printk("dev %d:%d ino %llx error %d caller %ps",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->error,
+		  (char *)__entry->caller_ip)
+);
+
+#define DEFINE_INODE_ERROR_EVENT(name) \
+DEFINE_EVENT(xfs_inode_error_class, name, \
+	TP_PROTO(struct xfs_inode *ip, int error, \
+		 unsigned long caller_ip), \
+	TP_ARGS(ip, error, caller_ip))
+
+/* reflink allocator */
+TRACE_EVENT(xfs_bmap_remap_alloc,
+	TP_PROTO(struct xfs_inode *ip, xfs_fsblock_t fsbno,
+		 xfs_extlen_t len),
+	TP_ARGS(ip, fsbno, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fsblock_t, fsbno)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->fsbno = fsbno;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d ino 0x%llx fsbno 0x%llx len %x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->fsbno,
+		  __entry->len)
+);
+DEFINE_INODE_ERROR_EVENT(xfs_bmap_remap_alloc_error);
+
+/* reflink tracepoint classes */
+
+/* two-file io tracepoint class */
+DECLARE_EVENT_CLASS(xfs_double_io_class,
+	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len,
+		 struct xfs_inode *dest, xfs_off_t doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_ino)
+		__field(loff_t, src_isize)
+		__field(loff_t, src_disize)
+		__field(loff_t, src_offset)
+		__field(size_t, len)
+		__field(xfs_ino_t, dest_ino)
+		__field(loff_t, dest_isize)
+		__field(loff_t, dest_disize)
+		__field(loff_t, dest_offset)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src)->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = VFS_I(src)->i_size;
+		__entry->src_disize = src->i_d.di_size;
+		__entry->src_offset = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = VFS_I(dest)->i_size;
+		__entry->dest_disize = dest->i_d.di_size;
+		__entry->dest_offset = doffset;
+	),
+	TP_printk("dev %d:%d count %zd "
+		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx -> "
+		  "ino 0x%llx isize 0x%llx disize 0x%llx offset 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->src_disize,
+		  __entry->src_offset,
+		  __entry->dest_ino,
+		  __entry->dest_isize,
+		  __entry->dest_disize,
+		  __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_double_io_class, name,	\
+	TP_PROTO(struct xfs_inode *src, xfs_off_t soffset, xfs_off_t len, \
+		 struct xfs_inode *dest, xfs_off_t doffset), \
+	TP_ARGS(src, soffset, len, dest, doffset))
+
+/* two-file vfs io tracepoint class */
+DECLARE_EVENT_CLASS(xfs_double_vfs_io_class,
+	TP_PROTO(struct inode *src, u64 soffset, u64 len,
+		 struct inode *dest, u64 doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, src_ino)
+		__field(loff_t, src_isize)
+		__field(loff_t, src_offset)
+		__field(size_t, len)
+		__field(unsigned long, dest_ino)
+		__field(loff_t, dest_isize)
+		__field(loff_t, dest_offset)
+	),
+	TP_fast_assign(
+		__entry->dev = src->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = i_size_read(src);
+		__entry->src_offset = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = i_size_read(dest);
+		__entry->dest_offset = doffset;
+	),
+	TP_printk("dev %d:%d count %zd "
+		  "ino 0x%lx isize 0x%llx offset 0x%llx -> "
+		  "ino 0x%lx isize 0x%llx offset 0x%llx",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->src_offset,
+		  __entry->dest_ino,
+		  __entry->dest_isize,
+		  __entry->dest_offset)
+)
+
+#define DEFINE_DOUBLE_VFS_IO_EVENT(name)	\
+DEFINE_EVENT(xfs_double_vfs_io_class, name,	\
+	TP_PROTO(struct inode *src, u64 soffset, u64 len, \
+		 struct inode *dest, u64 doffset), \
+	TP_ARGS(src, soffset, len, dest, doffset))
+
+/* CoW write tracepoint */
+DECLARE_EVENT_CLASS(xfs_copy_on_write_class,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk,
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
+	TP_ARGS(ip, lblk, pblk, len, new_pblk),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_fsblock_t, pblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, new_pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->pblk = pblk;
+		__entry->len = len;
+		__entry->new_pblk = new_pblk;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx pblk 0x%llx "
+		  "len 0x%x new_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->pblk,
+		  __entry->len,
+		  __entry->new_pblk)
+)
+
+#define DEFINE_COW_EVENT(name)	\
+DEFINE_EVENT(xfs_copy_on_write_class, name,	\
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk, xfs_fsblock_t pblk, \
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk), \
+	TP_ARGS(ip, lblk, pblk, len, new_pblk))
+
+/* inode/irec events */
+DECLARE_EVENT_CLASS(xfs_inode_irec_class,
+	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec),
+	TP_ARGS(ip, irec),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = irec->br_startoff;
+		__entry->len = irec->br_blockcount;
+		__entry->pblk = irec->br_startblock;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len,
+		  __entry->pblk)
+);
+#define DEFINE_INODE_IREC_EVENT(name) \
+DEFINE_EVENT(xfs_inode_irec_class, name, \
+	TP_PROTO(struct xfs_inode *ip, struct xfs_bmbt_irec *irec), \
+	TP_ARGS(ip, irec))
+
+/* refcount/reflink tracepoint definitions */
+
+/* reflink tracepoints */
+DEFINE_INODE_EVENT(xfs_reflink_set_inode_flag);
+DEFINE_INODE_EVENT(xfs_reflink_unset_inode_flag);
+DEFINE_ITRUNC_EVENT(xfs_reflink_update_inode_size);
+DEFINE_IOMAP_EVENT(xfs_reflink_remap_imap);
+TRACE_EVENT(xfs_reflink_remap_blocks_loop,
+	TP_PROTO(struct xfs_inode *src, xfs_fileoff_t soffset,
+		 xfs_filblks_t len, struct xfs_inode *dest,
+		 xfs_fileoff_t doffset),
+	TP_ARGS(src, soffset, len, dest, doffset),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, src_ino)
+		__field(xfs_fileoff_t, src_lblk)
+		__field(xfs_filblks_t, len)
+		__field(xfs_ino_t, dest_ino)
+		__field(xfs_fileoff_t, dest_lblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(src)->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_lblk = soffset;
+		__entry->len = len;
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_lblk = doffset;
+	),
+	TP_printk("dev %d:%d len 0x%llx "
+		  "ino 0x%llx offset 0x%llx blocks -> "
+		  "ino 0x%llx offset 0x%llx blocks",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->len,
+		  __entry->src_ino,
+		  __entry->src_lblk,
+		  __entry->dest_ino,
+		  __entry->dest_lblk)
+);
+TRACE_EVENT(xfs_reflink_punch_range,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+		 xfs_extlen_t len),
+	TP_ARGS(ip, lblk, len),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->len = len;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len)
+);
+TRACE_EVENT(xfs_reflink_remap,
+	TP_PROTO(struct xfs_inode *ip, xfs_fileoff_t lblk,
+		 xfs_extlen_t len, xfs_fsblock_t new_pblk),
+	TP_ARGS(ip, lblk, len, new_pblk),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(xfs_ino_t, ino)
+		__field(xfs_fileoff_t, lblk)
+		__field(xfs_extlen_t, len)
+		__field(xfs_fsblock_t, new_pblk)
+	),
+	TP_fast_assign(
+		__entry->dev = VFS_I(ip)->i_sb->s_dev;
+		__entry->ino = ip->i_ino;
+		__entry->lblk = lblk;
+		__entry->len = len;
+		__entry->new_pblk = new_pblk;
+	),
+	TP_printk("dev %d:%d ino 0x%llx lblk 0x%llx len 0x%x new_pblk %llu",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->ino,
+		  __entry->lblk,
+		  __entry->len,
+		  __entry->new_pblk)
+);
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_remap_range);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_set_inode_flag_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_update_inode_size_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_reflink_main_loop_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_read_iomap_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_blocks_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_remap_extent_error);
+
+/* dedupe tracepoints */
+DEFINE_DOUBLE_IO_EVENT(xfs_reflink_compare_extents);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_compare_extents_error);
+
+/* ioctl tracepoints */
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_reflink);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_clone_range);
+DEFINE_DOUBLE_VFS_IO_EVENT(xfs_ioctl_file_extent_same);
+TRACE_EVENT(xfs_ioctl_clone,
+	TP_PROTO(struct inode *src, struct inode *dest),
+	TP_ARGS(src, dest),
+	TP_STRUCT__entry(
+		__field(dev_t, dev)
+		__field(unsigned long, src_ino)
+		__field(loff_t, src_isize)
+		__field(unsigned long, dest_ino)
+		__field(loff_t, dest_isize)
+	),
+	TP_fast_assign(
+		__entry->dev = src->i_sb->s_dev;
+		__entry->src_ino = src->i_ino;
+		__entry->src_isize = i_size_read(src);
+		__entry->dest_ino = dest->i_ino;
+		__entry->dest_isize = i_size_read(dest);
+	),
+	TP_printk("dev %d:%d "
+		  "ino 0x%lx isize 0x%llx -> "
+		  "ino 0x%lx isize 0x%llx\n",
+		  MAJOR(__entry->dev), MINOR(__entry->dev),
+		  __entry->src_ino,
+		  __entry->src_isize,
+		  __entry->dest_ino,
+		  __entry->dest_isize)
+);
+
+/* unshare tracepoints */
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_unshare);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cow_eof_block);
+DEFINE_PAGE_EVENT(xfs_reflink_unshare_page);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_unshare_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cow_eof_block_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_dirty_page_error);
+
+/* copy on write */
+DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_around_shared);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_alloc);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_found);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_enospc);
+
+DEFINE_RW_EVENT(xfs_reflink_reserve_cow_range);
+DEFINE_RW_EVENT(xfs_reflink_allocate_cow_range);
+
+DEFINE_INODE_IREC_EVENT(xfs_reflink_bounce_dio_write);
+DEFINE_IOMAP_EVENT(xfs_reflink_find_cow_mapping);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_trim_irec);
+
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range);
+DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_piece);
+
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_reserve_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_allocate_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error);
+
+DEFINE_COW_EVENT(xfs_reflink_fork_buf);
+DEFINE_COW_EVENT(xfs_reflink_finish_fork_buf);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_fork_buf_error);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_finish_fork_buf_error);
+
+DEFINE_INODE_EVENT(xfs_reflink_cancel_pending_cow);
+DEFINE_INODE_IREC_EVENT(xfs_reflink_cancel_cow);
+DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_pending_cow_error);
+
+/* rmap swapext tracepoints */
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap);
+DEFINE_INODE_IREC_EVENT(xfs_swap_extent_rmap_remap_piece);
+DEFINE_INODE_ERROR_EVENT(xfs_swap_extent_rmap_error);
+
 #endif /* _TRACE_XFS_H */
 
 #undef TRACE_INCLUDE_PATH
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index e2bf86aad33d..61b7fbdd3ebd 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -36,6 +36,11 @@ struct xfs_busy_extent;
 struct xfs_rud_log_item;
 struct xfs_rui_log_item;
 struct xfs_btree_cur;
+struct xfs_cui_log_item;
+struct xfs_cud_log_item;
+struct xfs_defer_ops;
+struct xfs_bui_log_item;
+struct xfs_bud_log_item;
 
 typedef struct xfs_log_item {
 	struct list_head		li_ail;		/* AIL pointers */
@@ -248,4 +253,28 @@ int xfs_trans_log_finish_rmap_update(struct xfs_trans *tp,
 		xfs_fsblock_t startblock, xfs_filblks_t blockcount,
 		xfs_exntst_t state, struct xfs_btree_cur **pcur);
 
+/* refcount updates */
+enum xfs_refcount_intent_type;
+
+void xfs_refcount_update_init_defer_op(void);
+struct xfs_cud_log_item *xfs_trans_get_cud(struct xfs_trans *tp,
+		struct xfs_cui_log_item *cuip);
+int xfs_trans_log_finish_refcount_update(struct xfs_trans *tp,
+		struct xfs_cud_log_item *cudp, struct xfs_defer_ops *dfops,
+		enum xfs_refcount_intent_type type, xfs_fsblock_t startblock,
+		xfs_extlen_t blockcount, xfs_fsblock_t *new_fsb,
+		xfs_extlen_t *new_len, struct xfs_btree_cur **pcur);
+
+/* mapping updates */
+enum xfs_bmap_intent_type;
+
+void xfs_bmap_update_init_defer_op(void);
+struct xfs_bud_log_item *xfs_trans_get_bud(struct xfs_trans *tp,
+		struct xfs_bui_log_item *buip);
+int xfs_trans_log_finish_bmap_update(struct xfs_trans *tp,
+		struct xfs_bud_log_item *rudp, struct xfs_defer_ops *dfops,
+		enum xfs_bmap_intent_type type, struct xfs_inode *ip,
+		int whichfork, xfs_fileoff_t startoff, xfs_fsblock_t startblock,
+		xfs_filblks_t blockcount, xfs_exntst_t state);
+
 #endif	/* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_bmap.c b/fs/xfs/xfs_trans_bmap.c
new file mode 100644
index 000000000000..6408e7d7c08c
--- /dev/null
+++ b/fs/xfs/xfs_trans_bmap.c
@@ -0,0 +1,249 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_bmap_item.h"
+#include "xfs_alloc.h"
+#include "xfs_bmap.h"
+#include "xfs_inode.h"
+
+/*
+ * This routine is called to allocate a "bmap update done"
+ * log item.
+ */
+struct xfs_bud_log_item *
+xfs_trans_get_bud(
+	struct xfs_trans		*tp,
+	struct xfs_bui_log_item		*buip)
+{
+	struct xfs_bud_log_item		*budp;
+
+	budp = xfs_bud_init(tp->t_mountp, buip);
+	xfs_trans_add_item(tp, &budp->bud_item);
+	return budp;
+}
+
+/*
+ * Finish an bmap update and log it to the BUD. Note that the
+ * transaction is marked dirty regardless of whether the bmap update
+ * succeeds or fails to support the BUI/BUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_bmap_update(
+	struct xfs_trans		*tp,
+	struct xfs_bud_log_item		*budp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_bmap_intent_type	type,
+	struct xfs_inode		*ip,
+	int				whichfork,
+	xfs_fileoff_t			startoff,
+	xfs_fsblock_t			startblock,
+	xfs_filblks_t			blockcount,
+	xfs_exntst_t			state)
+{
+	int				error;
+
+	error = xfs_bmap_finish_one(tp, dop, ip, type, whichfork, startoff,
+			startblock, blockcount, state);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the BUI and frees the BUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	budp->bud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	return error;
+}
+
+/* Sort bmap intents by inode. */
+static int
+xfs_bmap_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_bmap_intent		*ba;
+	struct xfs_bmap_intent		*bb;
+
+	ba = container_of(a, struct xfs_bmap_intent, bi_list);
+	bb = container_of(b, struct xfs_bmap_intent, bi_list);
+	return ba->bi_owner->i_ino - bb->bi_owner->i_ino;
+}
+
+/* Get an BUI. */
+STATIC void *
+xfs_bmap_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_bui_log_item		*buip;
+
+	ASSERT(count == XFS_BUI_MAX_FAST_EXTENTS);
+	ASSERT(tp != NULL);
+
+	buip = xfs_bui_init(tp->t_mountp);
+	ASSERT(buip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &buip->bui_item);
+	return buip;
+}
+
+/* Set the map extent flags for this mapping. */
+static void
+xfs_trans_set_bmap_flags(
+	struct xfs_map_extent		*bmap,
+	enum xfs_bmap_intent_type	type,
+	int				whichfork,
+	xfs_exntst_t			state)
+{
+	bmap->me_flags = 0;
+	switch (type) {
+	case XFS_BMAP_MAP:
+	case XFS_BMAP_UNMAP:
+		bmap->me_flags = type;
+		break;
+	default:
+		ASSERT(0);
+	}
+	if (state == XFS_EXT_UNWRITTEN)
+		bmap->me_flags |= XFS_BMAP_EXTENT_UNWRITTEN;
+	if (whichfork == XFS_ATTR_FORK)
+		bmap->me_flags |= XFS_BMAP_EXTENT_ATTR_FORK;
+}
+
+/* Log bmap updates in the intent item. */
+STATIC void
+xfs_bmap_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_bui_log_item		*buip = intent;
+	struct xfs_bmap_intent		*bmap;
+	uint				next_extent;
+	struct xfs_map_extent		*map;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	buip->bui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&buip->bui_next_extent) - 1;
+	ASSERT(next_extent < buip->bui_format.bui_nextents);
+	map = &buip->bui_format.bui_extents[next_extent];
+	map->me_owner = bmap->bi_owner->i_ino;
+	map->me_startblock = bmap->bi_bmap.br_startblock;
+	map->me_startoff = bmap->bi_bmap.br_startoff;
+	map->me_len = bmap->bi_bmap.br_blockcount;
+	xfs_trans_set_bmap_flags(map, bmap->bi_type, bmap->bi_whichfork,
+			bmap->bi_bmap.br_state);
+}
+
+/* Get an BUD so we can process all the deferred rmap updates. */
+STATIC void *
+xfs_bmap_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_bud(tp, intent);
+}
+
+/* Process a deferred rmap update. */
+STATIC int
+xfs_bmap_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_bmap_intent		*bmap;
+	int				error;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	error = xfs_trans_log_finish_bmap_update(tp, done_item, dop,
+			bmap->bi_type,
+			bmap->bi_owner, bmap->bi_whichfork,
+			bmap->bi_bmap.br_startoff,
+			bmap->bi_bmap.br_startblock,
+			bmap->bi_bmap.br_blockcount,
+			bmap->bi_bmap.br_state);
+	kmem_free(bmap);
+	return error;
+}
+
+/* Abort all pending BUIs. */
+STATIC void
+xfs_bmap_update_abort_intent(
+	void				*intent)
+{
+	xfs_bui_release(intent);
+}
+
+/* Cancel a deferred rmap update. */
+STATIC void
+xfs_bmap_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_bmap_intent		*bmap;
+
+	bmap = container_of(item, struct xfs_bmap_intent, bi_list);
+	kmem_free(bmap);
+}
+
+static const struct xfs_defer_op_type xfs_bmap_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_BMAP,
+	.max_items	= XFS_BUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_bmap_update_diff_items,
+	.create_intent	= xfs_bmap_update_create_intent,
+	.abort_intent	= xfs_bmap_update_abort_intent,
+	.log_item	= xfs_bmap_update_log_item,
+	.create_done	= xfs_bmap_update_create_done,
+	.finish_item	= xfs_bmap_update_finish_item,
+	.cancel_item	= xfs_bmap_update_cancel_item,
+};
+
+/* Register the deferred op type. */
+void
+xfs_bmap_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_bmap_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_refcount.c b/fs/xfs/xfs_trans_refcount.c
new file mode 100644
index 000000000000..94c1877af834
--- /dev/null
+++ b/fs/xfs/xfs_trans_refcount.c
@@ -0,0 +1,264 @@
+/*
+ * Copyright (C) 2016 Oracle.  All Rights Reserved.
+ *
+ * Author: Darrick J. Wong <darrick.wong@oracle.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it would be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write the Free Software Foundation,
+ * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301, USA.
+ */
+#include "xfs.h"
+#include "xfs_fs.h"
+#include "xfs_shared.h"
+#include "xfs_format.h"
+#include "xfs_log_format.h"
+#include "xfs_trans_resv.h"
+#include "xfs_mount.h"
+#include "xfs_defer.h"
+#include "xfs_trans.h"
+#include "xfs_trans_priv.h"
+#include "xfs_refcount_item.h"
+#include "xfs_alloc.h"
+#include "xfs_refcount.h"
+
+/*
+ * This routine is called to allocate a "refcount update done"
+ * log item.
+ */
+struct xfs_cud_log_item *
+xfs_trans_get_cud(
+	struct xfs_trans		*tp,
+	struct xfs_cui_log_item		*cuip)
+{
+	struct xfs_cud_log_item		*cudp;
+
+	cudp = xfs_cud_init(tp->t_mountp, cuip);
+	xfs_trans_add_item(tp, &cudp->cud_item);
+	return cudp;
+}
+
+/*
+ * Finish an refcount update and log it to the CUD. Note that the
+ * transaction is marked dirty regardless of whether the refcount
+ * update succeeds or fails to support the CUI/CUD lifecycle rules.
+ */
+int
+xfs_trans_log_finish_refcount_update(
+	struct xfs_trans		*tp,
+	struct xfs_cud_log_item		*cudp,
+	struct xfs_defer_ops		*dop,
+	enum xfs_refcount_intent_type	type,
+	xfs_fsblock_t			startblock,
+	xfs_extlen_t			blockcount,
+	xfs_fsblock_t			*new_fsb,
+	xfs_extlen_t			*new_len,
+	struct xfs_btree_cur		**pcur)
+{
+	int				error;
+
+	error = xfs_refcount_finish_one(tp, dop, type, startblock,
+			blockcount, new_fsb, new_len, pcur);
+
+	/*
+	 * Mark the transaction dirty, even on error. This ensures the
+	 * transaction is aborted, which:
+	 *
+	 * 1.) releases the CUI and frees the CUD
+	 * 2.) shuts down the filesystem
+	 */
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	cudp->cud_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	return error;
+}
+
+/* Sort refcount intents by AG. */
+static int
+xfs_refcount_update_diff_items(
+	void				*priv,
+	struct list_head		*a,
+	struct list_head		*b)
+{
+	struct xfs_mount		*mp = priv;
+	struct xfs_refcount_intent	*ra;
+	struct xfs_refcount_intent	*rb;
+
+	ra = container_of(a, struct xfs_refcount_intent, ri_list);
+	rb = container_of(b, struct xfs_refcount_intent, ri_list);
+	return  XFS_FSB_TO_AGNO(mp, ra->ri_startblock) -
+		XFS_FSB_TO_AGNO(mp, rb->ri_startblock);
+}
+
+/* Get an CUI. */
+STATIC void *
+xfs_refcount_update_create_intent(
+	struct xfs_trans		*tp,
+	unsigned int			count)
+{
+	struct xfs_cui_log_item		*cuip;
+
+	ASSERT(tp != NULL);
+	ASSERT(count > 0);
+
+	cuip = xfs_cui_init(tp->t_mountp, count);
+	ASSERT(cuip != NULL);
+
+	/*
+	 * Get a log_item_desc to point at the new item.
+	 */
+	xfs_trans_add_item(tp, &cuip->cui_item);
+	return cuip;
+}
+
+/* Set the phys extent flags for this reverse mapping. */
+static void
+xfs_trans_set_refcount_flags(
+	struct xfs_phys_extent		*refc,
+	enum xfs_refcount_intent_type	type)
+{
+	refc->pe_flags = 0;
+	switch (type) {
+	case XFS_REFCOUNT_INCREASE:
+	case XFS_REFCOUNT_DECREASE:
+	case XFS_REFCOUNT_ALLOC_COW:
+	case XFS_REFCOUNT_FREE_COW:
+		refc->pe_flags |= type;
+		break;
+	default:
+		ASSERT(0);
+	}
+}
+
+/* Log refcount updates in the intent item. */
+STATIC void
+xfs_refcount_update_log_item(
+	struct xfs_trans		*tp,
+	void				*intent,
+	struct list_head		*item)
+{
+	struct xfs_cui_log_item		*cuip = intent;
+	struct xfs_refcount_intent	*refc;
+	uint				next_extent;
+	struct xfs_phys_extent		*ext;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+
+	tp->t_flags |= XFS_TRANS_DIRTY;
+	cuip->cui_item.li_desc->lid_flags |= XFS_LID_DIRTY;
+
+	/*
+	 * atomic_inc_return gives us the value after the increment;
+	 * we want to use it as an array index so we need to subtract 1 from
+	 * it.
+	 */
+	next_extent = atomic_inc_return(&cuip->cui_next_extent) - 1;
+	ASSERT(next_extent < cuip->cui_format.cui_nextents);
+	ext = &cuip->cui_format.cui_extents[next_extent];
+	ext->pe_startblock = refc->ri_startblock;
+	ext->pe_len = refc->ri_blockcount;
+	xfs_trans_set_refcount_flags(ext, refc->ri_type);
+}
+
+/* Get an CUD so we can process all the deferred refcount updates. */
+STATIC void *
+xfs_refcount_update_create_done(
+	struct xfs_trans		*tp,
+	void				*intent,
+	unsigned int			count)
+{
+	return xfs_trans_get_cud(tp, intent);
+}
+
+/* Process a deferred refcount update. */
+STATIC int
+xfs_refcount_update_finish_item(
+	struct xfs_trans		*tp,
+	struct xfs_defer_ops		*dop,
+	struct list_head		*item,
+	void				*done_item,
+	void				**state)
+{
+	struct xfs_refcount_intent	*refc;
+	xfs_fsblock_t			new_fsb;
+	xfs_extlen_t			new_aglen;
+	int				error;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	error = xfs_trans_log_finish_refcount_update(tp, done_item, dop,
+			refc->ri_type,
+			refc->ri_startblock,
+			refc->ri_blockcount,
+			&new_fsb, &new_aglen,
+			(struct xfs_btree_cur **)state);
+	/* Did we run out of reservation?  Requeue what we didn't finish. */
+	if (!error && new_aglen > 0) {
+		ASSERT(refc->ri_type == XFS_REFCOUNT_INCREASE ||
+		       refc->ri_type == XFS_REFCOUNT_DECREASE);
+		refc->ri_startblock = new_fsb;
+		refc->ri_blockcount = new_aglen;
+		return -EAGAIN;
+	}
+	kmem_free(refc);
+	return error;
+}
+
+/* Clean up after processing deferred refcounts. */
+STATIC void
+xfs_refcount_update_finish_cleanup(
+	struct xfs_trans	*tp,
+	void			*state,
+	int			error)
+{
+	struct xfs_btree_cur	*rcur = state;
+
+	xfs_refcount_finish_one_cleanup(tp, rcur, error);
+}
+
+/* Abort all pending CUIs. */
+STATIC void
+xfs_refcount_update_abort_intent(
+	void				*intent)
+{
+	xfs_cui_release(intent);
+}
+
+/* Cancel a deferred refcount update. */
+STATIC void
+xfs_refcount_update_cancel_item(
+	struct list_head		*item)
+{
+	struct xfs_refcount_intent	*refc;
+
+	refc = container_of(item, struct xfs_refcount_intent, ri_list);
+	kmem_free(refc);
+}
+
+static const struct xfs_defer_op_type xfs_refcount_update_defer_type = {
+	.type		= XFS_DEFER_OPS_TYPE_REFCOUNT,
+	.max_items	= XFS_CUI_MAX_FAST_EXTENTS,
+	.diff_items	= xfs_refcount_update_diff_items,
+	.create_intent	= xfs_refcount_update_create_intent,
+	.abort_intent	= xfs_refcount_update_abort_intent,
+	.log_item	= xfs_refcount_update_log_item,
+	.create_done	= xfs_refcount_update_create_done,
+	.finish_item	= xfs_refcount_update_finish_item,
+	.finish_cleanup = xfs_refcount_update_finish_cleanup,
+	.cancel_item	= xfs_refcount_update_cancel_item,
+};
+
+/* Register the deferred op type. */
+void
+xfs_refcount_update_init_defer_op(void)
+{
+	xfs_defer_init_op_type(&xfs_refcount_update_defer_type);
+}
diff --git a/fs/xfs/xfs_trans_rmap.c b/fs/xfs/xfs_trans_rmap.c
index 5a50ef881568..9ead064b5e90 100644
--- a/fs/xfs/xfs_trans_rmap.c
+++ b/fs/xfs/xfs_trans_rmap.c
@@ -48,12 +48,21 @@ xfs_trans_set_rmap_flags(
 	case XFS_RMAP_MAP:
 		rmap->me_flags |= XFS_RMAP_EXTENT_MAP;
 		break;
+	case XFS_RMAP_MAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_MAP_SHARED;
+		break;
 	case XFS_RMAP_UNMAP:
 		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP;
 		break;
+	case XFS_RMAP_UNMAP_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_UNMAP_SHARED;
+		break;
 	case XFS_RMAP_CONVERT:
 		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT;
 		break;
+	case XFS_RMAP_CONVERT_SHARED:
+		rmap->me_flags |= XFS_RMAP_EXTENT_CONVERT_SHARED;
+		break;
 	case XFS_RMAP_ALLOC:
 		rmap->me_flags |= XFS_RMAP_EXTENT_ALLOC;
 		break;
diff --git a/include/asm-generic/uaccess.h b/include/asm-generic/uaccess.h
index 6df9b0749671..cc6bb319e464 100644
--- a/include/asm-generic/uaccess.h
+++ b/include/asm-generic/uaccess.h
@@ -69,10 +69,6 @@ struct exception_table_entry
 	unsigned long insn, fixup;
 };
 
-/* Returns 0 if exception not found and fixup otherwise.  */
-extern unsigned long search_exception_table(unsigned long);
-
-
 /*
  * architectures with an MMU should override these two
  */
diff --git a/include/dt-bindings/thermal/tegra124-soctherm.h b/include/dt-bindings/thermal/tegra124-soctherm.h
index 729ab9fc325e..2a99f1d52bb5 100644
--- a/include/dt-bindings/thermal/tegra124-soctherm.h
+++ b/include/dt-bindings/thermal/tegra124-soctherm.h
@@ -11,4 +11,9 @@
 #define TEGRA124_SOCTHERM_SENSOR_PLLX 3
 #define TEGRA124_SOCTHERM_SENSOR_NUM 4
 
+#define TEGRA_SOCTHERM_THROT_LEVEL_LOW  0
+#define TEGRA_SOCTHERM_THROT_LEVEL_MED  1
+#define TEGRA_SOCTHERM_THROT_LEVEL_HIGH 2
+#define TEGRA_SOCTHERM_THROT_LEVEL_NONE -1
+
 #endif
diff --git a/include/linux/amba/clcd.h b/include/linux/amba/clcd.h
index e82e3ee2c54a..1035879b322c 100644
--- a/include/linux/amba/clcd.h
+++ b/include/linux/amba/clcd.h
@@ -67,6 +67,17 @@
 #define CNTL_LDMAFIFOTIME	(1 << 15)
 #define CNTL_WATERMARK		(1 << 16)
 
+/* ST Microelectronics variant bits */
+#define CNTL_ST_1XBPP_444	0x0
+#define CNTL_ST_1XBPP_5551	(1 << 17)
+#define CNTL_ST_1XBPP_565	(1 << 18)
+#define CNTL_ST_CDWID_12	0x0
+#define CNTL_ST_CDWID_16	(1 << 19)
+#define CNTL_ST_CDWID_18	(1 << 20)
+#define CNTL_ST_CDWID_24	((1 << 19)|(1 << 20))
+#define CNTL_ST_CEAEN		(1 << 21)
+#define CNTL_ST_LCDBPP24_PACKED	(6 << 1)
+
 enum {
 	/* individual formats */
 	CLCD_CAP_RGB444		= (1 << 0),
@@ -93,6 +104,8 @@ enum {
 	CLCD_CAP_ALL		= CLCD_CAP_BGR | CLCD_CAP_RGB,
 };
 
+struct backlight_device;
+
 struct clcd_panel {
 	struct fb_videomode	mode;
 	signed short		width;	/* width in mm */
@@ -105,6 +118,13 @@ struct clcd_panel {
 				fixedtimings:1,
 				grayscale:1;
 	unsigned int		connector;
+	struct backlight_device	*backlight;
+	/*
+	 * If the B/R lines are switched between the CLCD
+	 * and the panel we need to know this and not try to
+	 * compensate with the BGR bit in the control register.
+	 */
+	bool			bgr_connection;
 };
 
 struct clcd_regs {
@@ -170,11 +190,38 @@ struct clcd_board {
 struct amba_device;
 struct clk;
 
+/**
+ * struct clcd_vendor_data - holds hardware (IP-block) vendor-specific
+ * variant information
+ *
+ * @clock_timregs: the CLCD needs to be clocked when accessing the
+ * timer registers, or the hardware will hang.
+ * @packed_24_bit_pixels: this variant supports 24bit packed pixel data,
+ * so that RGB accesses 3 bytes at a time, not just on even 32bit
+ * boundaries, packing the pixel data in memory. ST Microelectronics
+ * have this.
+ * @st_bitmux_control: ST Microelectronics have implemented output
+ * bit line multiplexing into the CLCD control register. This indicates
+ * that we need to use this.
+ * @init_board: custom board init function for this variant
+ * @init_panel: custom panel init function for this variant
+ */
+struct clcd_vendor_data {
+	bool	clock_timregs;
+	bool	packed_24_bit_pixels;
+	bool	st_bitmux_control;
+	int	(*init_board)(struct amba_device *adev,
+			      struct clcd_board *board);
+	int	(*init_panel)(struct clcd_fb *fb,
+			      struct device_node *panel);
+};
+
 /* this data structure describes each frame buffer device we find */
 struct clcd_fb {
 	struct fb_info		fb;
 	struct amba_device	*dev;
 	struct clk		*clk;
+	struct clcd_vendor_data	*vendor;
 	struct clcd_panel	*panel;
 	struct clcd_board	*board;
 	void			*board_data;
@@ -231,16 +278,22 @@ static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs)
 	if (var->grayscale)
 		val |= CNTL_LCDBW;
 
-	if (fb->panel->caps && fb->board->caps &&
-	    var->bits_per_pixel >= 16) {
+	if (fb->panel->caps && fb->board->caps && var->bits_per_pixel >= 16) {
 		/*
 		 * if board and panel supply capabilities, we can support
-		 * changing BGR/RGB depending on supplied parameters
+		 * changing BGR/RGB depending on supplied parameters. Here
+		 * we switch to what the framebuffer is providing if need
+		 * be, so if the framebuffer is BGR but the display connection
+		 * is RGB (first case) we switch it around. Vice versa mutatis
+		 * mutandis if the framebuffer is RGB but the display connection
+		 * is BGR, we flip it around.
 		 */
 		if (var->red.offset == 0)
 			val &= ~CNTL_BGR;
 		else
 			val |= CNTL_BGR;
+		if (fb->panel->bgr_connection)
+			val ^= CNTL_BGR;
 	}
 
 	switch (var->bits_per_pixel) {
@@ -270,6 +323,10 @@ static inline void clcdfb_decode(struct clcd_fb *fb, struct clcd_regs *regs)
 		else
 			val |= CNTL_LCDBPP16_444;
 		break;
+	case 24:
+		/* Modified variant supporting 24 bit packed pixels */
+		val |= CNTL_ST_LCDBPP24_PACKED;
+		break;
 	case 32:
 		val |= CNTL_LCDBPP24;
 		break;
diff --git a/include/linux/exportfs.h b/include/linux/exportfs.h
index b03c0625fa6e..5ab958cdc50b 100644
--- a/include/linux/exportfs.h
+++ b/include/linux/exportfs.h
@@ -157,12 +157,13 @@ struct fid {
  *    @fh_to_dentry is given a &struct super_block (@sb) and a file handle
  *    fragment (@fh, @fh_len). It should return a &struct dentry which refers
  *    to the same file that the file handle fragment refers to.  If it cannot,
- *    it should return a %NULL pointer if the file was found but no acceptable
- *    &dentries were available, or an %ERR_PTR error code indicating why it
- *    couldn't be found (e.g. %ENOENT or %ENOMEM).  Any suitable dentry can be
- *    returned including, if necessary, a new dentry created with d_alloc_root.
- *    The caller can then find any other extant dentries by following the
- *    d_alias links.
+ *    it should return a %NULL pointer if the file cannot be found, or an
+ *    %ERR_PTR error code of %ENOMEM if a memory allocation failure occurred.
+ *    Any other error code is treated like %NULL, and will cause an %ESTALE error
+ *    for callers of exportfs_decode_fh().
+ *    Any suitable dentry can be returned including, if necessary, a new dentry
+ *    created with d_alloc_root.  The caller can then find any other extant
+ *    dentries by following the d_alias links.
  *
  * fh_to_parent:
  *    Same as @fh_to_dentry, except that it returns a pointer to the parent
diff --git a/include/linux/falloc.h b/include/linux/falloc.h
index 996111000a8c..7494dc67c66f 100644
--- a/include/linux/falloc.h
+++ b/include/linux/falloc.h
@@ -25,6 +25,7 @@ struct space_resv {
 					 FALLOC_FL_PUNCH_HOLE |		\
 					 FALLOC_FL_COLLAPSE_RANGE |	\
 					 FALLOC_FL_ZERO_RANGE |		\
-					 FALLOC_FL_INSERT_RANGE)
+					 FALLOC_FL_INSERT_RANGE |	\
+					 FALLOC_FL_UNSHARE_RANGE)
 
 #endif /* _FALLOC_H_ */
diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h
index 77c141797152..58276144ba81 100644
--- a/include/linux/mlx5/device.h
+++ b/include/linux/mlx5/device.h
@@ -92,12 +92,21 @@ __mlx5_mask(typ, fld))
 	___t; \
 })
 
-#define MLX5_SET64(typ, p, fld, v) do { \
+#define __MLX5_SET64(typ, p, fld, v) do { \
 	BUILD_BUG_ON(__mlx5_bit_sz(typ, fld) != 64); \
-	BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \
 	*((__be64 *)(p) + __mlx5_64_off(typ, fld)) = cpu_to_be64(v); \
 } while (0)
 
+#define MLX5_SET64(typ, p, fld, v) do { \
+	BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \
+	__MLX5_SET64(typ, p, fld, v); \
+} while (0)
+
+#define MLX5_ARRAY_SET64(typ, p, fld, idx, v) do { \
+	BUILD_BUG_ON(__mlx5_bit_off(typ, fld) % 64); \
+	__MLX5_SET64(typ, p, fld[idx], v); \
+} while (0)
+
 #define MLX5_GET64(typ, p, fld) be64_to_cpu(*((__be64 *)(p) + __mlx5_64_off(typ, fld)))
 
 #define MLX5_GET64_PR(typ, p, fld) ({ \
diff --git a/include/linux/nfs4.h b/include/linux/nfs4.h
index c6564ada9beb..9094faf0699d 100644
--- a/include/linux/nfs4.h
+++ b/include/linux/nfs4.h
@@ -67,6 +67,7 @@ struct nfs4_stateid_struct {
 		NFS4_DELEGATION_STATEID_TYPE,
 		NFS4_LAYOUT_STATEID_TYPE,
 		NFS4_PNFS_DS_STATEID_TYPE,
+		NFS4_REVOKED_STATEID_TYPE,
 	} type;
 };
 
diff --git a/include/linux/nfs_fs_sb.h b/include/linux/nfs_fs_sb.h
index 14a762d2734d..b34097c67848 100644
--- a/include/linux/nfs_fs_sb.h
+++ b/include/linux/nfs_fs_sb.h
@@ -103,6 +103,9 @@ struct nfs_client {
 #define NFS_SP4_MACH_CRED_WRITE    5	/* WRITE */
 #define NFS_SP4_MACH_CRED_COMMIT   6	/* COMMIT */
 #define NFS_SP4_MACH_CRED_PNFS_CLEANUP  7 /* LAYOUTRETURN */
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+	wait_queue_head_t	cl_lock_waitq;
+#endif /* CONFIG_NFS_V4_1 */
 #endif /* CONFIG_NFS_V4 */
 
 	/* Our own IP address, as a null-terminated string.
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 7cc0deee5bde..beb1e10f446e 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -125,6 +125,11 @@ struct nfs_fattr {
 		| NFS_ATTR_FATTR_V4_SECURITY_LABEL)
 
 /*
+ * Maximal number of supported layout drivers.
+ */
+#define NFS_MAX_LAYOUT_TYPES 8
+
+/*
  * Info on the file system
  */
 struct nfs_fsinfo {
@@ -139,7 +144,8 @@ struct nfs_fsinfo {
 	__u64			maxfilesize;
 	struct timespec		time_delta; /* server time granularity */
 	__u32			lease_time; /* in seconds */
-	__u32			layouttype; /* supported pnfs layout driver */
+	__u32			nlayouttypes; /* number of layouttypes */
+	__u32			layouttype[NFS_MAX_LAYOUT_TYPES]; /* supported pnfs layout driver */
 	__u32			blksize; /* preferred pnfs io block size */
 	__u32			clone_blksize; /* granularity of a CLONE operation */
 };
diff --git a/include/linux/pwm.h b/include/linux/pwm.h
index f1bbae014889..2c6c5114c089 100644
--- a/include/linux/pwm.h
+++ b/include/linux/pwm.h
@@ -641,6 +641,7 @@ static inline void pwm_remove_table(struct pwm_lookup *table, size_t num)
 #ifdef CONFIG_PWM_SYSFS
 void pwmchip_sysfs_export(struct pwm_chip *chip);
 void pwmchip_sysfs_unexport(struct pwm_chip *chip);
+void pwmchip_sysfs_unexport_children(struct pwm_chip *chip);
 #else
 static inline void pwmchip_sysfs_export(struct pwm_chip *chip)
 {
@@ -649,6 +650,10 @@ static inline void pwmchip_sysfs_export(struct pwm_chip *chip)
 static inline void pwmchip_sysfs_unexport(struct pwm_chip *chip)
 {
 }
+
+static inline void pwmchip_sysfs_unexport_children(struct pwm_chip *chip)
+{
+}
 #endif /* CONFIG_PWM_SYSFS */
 
 #endif /* __LINUX_PWM_H */
diff --git a/include/linux/sunrpc/auth.h b/include/linux/sunrpc/auth.h
index 4ccf184e971f..b1bc62ba20a2 100644
--- a/include/linux/sunrpc/auth.h
+++ b/include/linux/sunrpc/auth.h
@@ -131,6 +131,7 @@ struct rpc_authops {
 	struct rpc_auth *	(*create)(struct rpc_auth_create_args *, struct rpc_clnt *);
 	void			(*destroy)(struct rpc_auth *);
 
+	int			(*hash_cred)(struct auth_cred *, unsigned int);
 	struct rpc_cred *	(*lookup_cred)(struct rpc_auth *, struct auth_cred *, int);
 	struct rpc_cred *	(*crcreate)(struct rpc_auth*, struct auth_cred *, int, gfp_t);
 	int			(*list_pseudoflavors)(rpc_authflavor_t *, int);
diff --git a/include/linux/sunrpc/clnt.h b/include/linux/sunrpc/clnt.h
index 5c02b0691587..85cc819676e8 100644
--- a/include/linux/sunrpc/clnt.h
+++ b/include/linux/sunrpc/clnt.h
@@ -125,6 +125,13 @@ struct rpc_create_args {
 	struct svc_xprt		*bc_xprt;	/* NFSv4.1 backchannel */
 };
 
+struct rpc_add_xprt_test {
+	int (*add_xprt_test)(struct rpc_clnt *,
+		struct rpc_xprt *,
+		void *calldata);
+	void *data;
+};
+
 /* Values for "flags" field */
 #define RPC_CLNT_CREATE_HARDRTRY	(1UL << 0)
 #define RPC_CLNT_CREATE_AUTOBIND	(1UL << 2)
@@ -198,6 +205,16 @@ int		rpc_clnt_add_xprt(struct rpc_clnt *, struct xprt_create *,
 void		rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt,
 			unsigned long timeo);
 
+int		rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *,
+			struct rpc_xprt_switch *,
+			struct rpc_xprt *,
+			void *);
+
 const char *rpc_proc_name(const struct rpc_task *task);
+
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *);
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *, struct rpc_xprt *);
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+			const struct sockaddr *sap);
 #endif /* __KERNEL__ */
 #endif /* _LINUX_SUNRPC_CLNT_H */
diff --git a/include/linux/sunrpc/rpc_rdma.h b/include/linux/sunrpc/rpc_rdma.h
index 3b1ff38f0c37..cfda6adcf33c 100644
--- a/include/linux/sunrpc/rpc_rdma.h
+++ b/include/linux/sunrpc/rpc_rdma.h
@@ -41,10 +41,15 @@
 #define _LINUX_SUNRPC_RPC_RDMA_H
 
 #include <linux/types.h>
+#include <linux/bitops.h>
 
 #define RPCRDMA_VERSION		1
 #define rpcrdma_version		cpu_to_be32(RPCRDMA_VERSION)
 
+enum {
+	RPCRDMA_V1_DEF_INLINE_SIZE	= 1024,
+};
+
 struct rpcrdma_segment {
 	__be32 rs_handle;	/* Registered memory handle */
 	__be32 rs_length;	/* Length of the chunk in bytes */
@@ -129,4 +134,38 @@ enum rpcrdma_proc {
 #define rdma_done	cpu_to_be32(RDMA_DONE)
 #define rdma_error	cpu_to_be32(RDMA_ERROR)
 
+/*
+ * Private extension to RPC-over-RDMA Version One.
+ * Message passed during RDMA-CM connection set-up.
+ *
+ * Add new fields at the end, and don't permute existing
+ * fields.
+ */
+struct rpcrdma_connect_private {
+	__be32			cp_magic;
+	u8			cp_version;
+	u8			cp_flags;
+	u8			cp_send_size;
+	u8			cp_recv_size;
+} __packed;
+
+#define rpcrdma_cmp_magic	__cpu_to_be32(0xf6ab0e18)
+
+enum {
+	RPCRDMA_CMP_VERSION		= 1,
+	RPCRDMA_CMP_F_SND_W_INV_OK	= BIT(0),
+};
+
+static inline u8
+rpcrdma_encode_buffer_size(unsigned int size)
+{
+	return (size >> 10) - 1;
+}
+
+static inline unsigned int
+rpcrdma_decode_buffer_size(u8 val)
+{
+	return ((unsigned int)val + 1) << 10;
+}
+
 #endif				/* _LINUX_SUNRPC_RPC_RDMA_H */
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index 817af0b4385e..7ba040c797ec 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -239,8 +239,8 @@ struct rpc_task *rpc_wake_up_first(struct rpc_wait_queue *,
 					void *);
 void		rpc_wake_up_status(struct rpc_wait_queue *, int);
 void		rpc_delay(struct rpc_task *, unsigned long);
-void *		rpc_malloc(struct rpc_task *, size_t);
-void		rpc_free(void *);
+int		rpc_malloc(struct rpc_task *);
+void		rpc_free(struct rpc_task *);
 int		rpciod_up(void);
 void		rpciod_down(void);
 int		__rpc_wait_for_completion_task(struct rpc_task *task, wait_bit_action_f *);
diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h
index d6917b896d3a..cc3ae16eac68 100644
--- a/include/linux/sunrpc/svc_rdma.h
+++ b/include/linux/sunrpc/svc_rdma.h
@@ -86,6 +86,7 @@ struct svc_rdma_op_ctxt {
 	unsigned long flags;
 	enum dma_data_direction direction;
 	int count;
+	unsigned int mapped_sges;
 	struct ib_sge sge[RPCSVC_MAXPAGES];
 	struct page *pages[RPCSVC_MAXPAGES];
 };
@@ -136,6 +137,7 @@ struct svcxprt_rdma {
 	int		     sc_ord;		/* RDMA read limit */
 	int                  sc_max_sge;
 	int                  sc_max_sge_rd;	/* max sge for read target */
+	bool		     sc_snd_w_inv;	/* OK to use Send With Invalidate */
 
 	atomic_t             sc_sq_count;	/* Number of SQ WR on queue */
 	unsigned int	     sc_sq_depth;	/* Depth of SQ */
@@ -193,6 +195,14 @@ struct svcxprt_rdma {
 
 #define RPCSVC_MAXPAYLOAD_RDMA	RPCSVC_MAXPAYLOAD
 
+/* Track DMA maps for this transport and context */
+static inline void svc_rdma_count_mappings(struct svcxprt_rdma *rdma,
+					   struct svc_rdma_op_ctxt *ctxt)
+{
+	ctxt->mapped_sges++;
+	atomic_inc(&rdma->sc_dma_used);
+}
+
 /* svc_rdma_backchannel.c */
 extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt,
 				    struct rpcrdma_msg *rmsgp,
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 70c6b92e15a7..56c48c884a24 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -67,6 +67,18 @@ struct xdr_buf {
 			len;		/* Length of XDR encoded message */
 };
 
+static inline void
+xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
+{
+	buf->head[0].iov_base = start;
+	buf->head[0].iov_len = len;
+	buf->tail[0].iov_len = 0;
+	buf->page_len = 0;
+	buf->flags = 0;
+	buf->len = 0;
+	buf->buflen = len;
+}
+
 /*
  * pre-xdr'ed macros.
  */
diff --git a/include/linux/sunrpc/xprt.h b/include/linux/sunrpc/xprt.h
index a16070dd03ee..a5da60b24d83 100644
--- a/include/linux/sunrpc/xprt.h
+++ b/include/linux/sunrpc/xprt.h
@@ -83,9 +83,11 @@ struct rpc_rqst {
 	void (*rq_release_snd_buf)(struct rpc_rqst *); /* release rq_enc_pages */
 	struct list_head	rq_list;
 
-	__u32 *			rq_buffer;	/* XDR encode buffer */
-	size_t			rq_callsize,
-				rq_rcvsize;
+	void			*rq_xprtdata;	/* Per-xprt private data */
+	void			*rq_buffer;	/* Call XDR encode buffer */
+	size_t			rq_callsize;
+	void			*rq_rbuffer;	/* Reply XDR decode buffer */
+	size_t			rq_rcvsize;
 	size_t			rq_xmit_bytes_sent;	/* total bytes sent */
 	size_t			rq_reply_bytes_recvd;	/* total reply bytes */
 							/* received */
@@ -127,8 +129,8 @@ struct rpc_xprt_ops {
 	void		(*rpcbind)(struct rpc_task *task);
 	void		(*set_port)(struct rpc_xprt *xprt, unsigned short port);
 	void		(*connect)(struct rpc_xprt *xprt, struct rpc_task *task);
-	void *		(*buf_alloc)(struct rpc_task *task, size_t size);
-	void		(*buf_free)(void *buffer);
+	int		(*buf_alloc)(struct rpc_task *task);
+	void		(*buf_free)(struct rpc_task *task);
 	int		(*send_request)(struct rpc_task *task);
 	void		(*set_retrans_timeout)(struct rpc_task *task);
 	void		(*timer)(struct rpc_xprt *xprt, struct rpc_task *task);
diff --git a/include/linux/sunrpc/xprtmultipath.h b/include/linux/sunrpc/xprtmultipath.h
index 5a9acffa41be..507418c1c69e 100644
--- a/include/linux/sunrpc/xprtmultipath.h
+++ b/include/linux/sunrpc/xprtmultipath.h
@@ -66,4 +66,6 @@ extern struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi);
 extern struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi);
 extern struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi);
 
+extern bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+		const struct sockaddr *sap);
 #endif
diff --git a/include/linux/sunrpc/xprtrdma.h b/include/linux/sunrpc/xprtrdma.h
index 39267dc3486a..221b7a2e5406 100644
--- a/include/linux/sunrpc/xprtrdma.h
+++ b/include/linux/sunrpc/xprtrdma.h
@@ -53,8 +53,8 @@
 #define RPCRDMA_MAX_SLOT_TABLE	(256U)
 
 #define RPCRDMA_MIN_INLINE  (1024)	/* min inline thresh */
-#define RPCRDMA_DEF_INLINE  (1024)	/* default inline thresh */
-#define RPCRDMA_MAX_INLINE  (3068)	/* max inline thresh */
+#define RPCRDMA_DEF_INLINE  (4096)	/* default inline thresh */
+#define RPCRDMA_MAX_INLINE  (65536)	/* max inline thresh */
 
 /* Memory registration strategies, by number.
  * This is part of a kernel / user space API. Do not remove. */
diff --git a/include/linux/thermal.h b/include/linux/thermal.h
index ee517bef0db0..511182a88e76 100644
--- a/include/linux/thermal.h
+++ b/include/linux/thermal.h
@@ -92,12 +92,24 @@ enum thermal_trend {
 	THERMAL_TREND_DROP_FULL, /* apply lowest cooling action */
 };
 
+/* Thermal notification reason */
+enum thermal_notify_event {
+	THERMAL_EVENT_UNSPECIFIED, /* Unspecified event */
+	THERMAL_EVENT_TEMP_SAMPLE, /* New Temperature sample */
+	THERMAL_TRIP_VIOLATED, /* TRIP Point violation */
+	THERMAL_TRIP_CHANGED, /* TRIP Point temperature changed */
+	THERMAL_DEVICE_DOWN, /* Thermal device is down */
+	THERMAL_DEVICE_UP, /* Thermal device is up after a down event */
+	THERMAL_DEVICE_POWER_CAPABILITY_CHANGED, /* power capability changed */
+};
+
 struct thermal_zone_device_ops {
 	int (*bind) (struct thermal_zone_device *,
 		     struct thermal_cooling_device *);
 	int (*unbind) (struct thermal_zone_device *,
 		       struct thermal_cooling_device *);
 	int (*get_temp) (struct thermal_zone_device *, int *);
+	int (*set_trips) (struct thermal_zone_device *, int, int);
 	int (*get_mode) (struct thermal_zone_device *,
 			 enum thermal_device_mode *);
 	int (*set_mode) (struct thermal_zone_device *,
@@ -168,6 +180,10 @@ struct thermal_attr {
  * @last_temperature:	previous temperature read
  * @emul_temperature:	emulated temperature when using CONFIG_THERMAL_EMULATION
  * @passive:		1 if you've crossed a passive trip point, 0 otherwise.
+ * @prev_low_trip:	the low current temperature if you've crossed a passive
+			trip point.
+ * @prev_high_trip:	the above current temperature if you've crossed a
+			passive trip point.
  * @forced_passive:	If > 0, temperature at which to switch on all ACPI
  *			processor cooling devices.  Currently only used by the
  *			step-wise governor.
@@ -182,6 +198,7 @@ struct thermal_attr {
  * @lock:	lock to protect thermal_instances list
  * @node:	node in thermal_tz_list (in thermal_core.c)
  * @poll_queue:	delayed work for polling
+ * @notify_event: Last notification event
  */
 struct thermal_zone_device {
 	int id;
@@ -199,6 +216,8 @@ struct thermal_zone_device {
 	int last_temperature;
 	int emul_temperature;
 	int passive;
+	int prev_low_trip;
+	int prev_high_trip;
 	unsigned int forced_passive;
 	atomic_t need_update;
 	struct thermal_zone_device_ops *ops;
@@ -210,6 +229,7 @@ struct thermal_zone_device {
 	struct mutex lock;
 	struct list_head node;
 	struct delayed_work poll_queue;
+	enum thermal_notify_event notify_event;
 };
 
 /**
@@ -333,6 +353,9 @@ struct thermal_genl_event {
  *
  * Optional:
  * @get_trend: a pointer to a function that reads the sensor temperature trend.
+ * @set_trips: a pointer to a function that sets a temperature window. When
+ *	       this window is left the driver must inform the thermal core via
+ *	       thermal_zone_device_update.
  * @set_emul_temp: a pointer to a function that sets sensor emulated
  *		   temperature.
  * @set_trip_temp: a pointer to a function that sets the trip temperature on
@@ -340,7 +363,8 @@ struct thermal_genl_event {
  */
 struct thermal_zone_of_device_ops {
 	int (*get_temp)(void *, int *);
-	int (*get_trend)(void *, long *);
+	int (*get_trend)(void *, int, enum thermal_trend *);
+	int (*set_trips)(void *, int, int);
 	int (*set_emul_temp)(void *, int);
 	int (*set_trip_temp)(void *, int, int);
 };
@@ -425,7 +449,9 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *, int,
 				     unsigned int);
 int thermal_zone_unbind_cooling_device(struct thermal_zone_device *, int,
 				       struct thermal_cooling_device *);
-void thermal_zone_device_update(struct thermal_zone_device *);
+void thermal_zone_device_update(struct thermal_zone_device *,
+				enum thermal_notify_event);
+void thermal_zone_set_trips(struct thermal_zone_device *);
 
 struct thermal_cooling_device *thermal_cooling_device_register(char *, void *,
 		const struct thermal_cooling_device_ops *);
@@ -435,6 +461,8 @@ thermal_of_cooling_device_register(struct device_node *np, char *, void *,
 void thermal_cooling_device_unregister(struct thermal_cooling_device *);
 struct thermal_zone_device *thermal_zone_get_zone_by_name(const char *name);
 int thermal_zone_get_temp(struct thermal_zone_device *tz, int *temp);
+int thermal_zone_get_slope(struct thermal_zone_device *tz);
+int thermal_zone_get_offset(struct thermal_zone_device *tz);
 
 int get_tz_trend(struct thermal_zone_device *, int);
 struct thermal_instance *get_thermal_instance(struct thermal_zone_device *,
@@ -473,7 +501,10 @@ static inline int thermal_zone_unbind_cooling_device(
 	struct thermal_zone_device *tz, int trip,
 	struct thermal_cooling_device *cdev)
 { return -ENODEV; }
-static inline void thermal_zone_device_update(struct thermal_zone_device *tz)
+static inline void thermal_zone_device_update(struct thermal_zone_device *tz,
+					      enum thermal_notify_event event)
+{ }
+static inline void thermal_zone_set_trips(struct thermal_zone_device *tz)
 { }
 static inline struct thermal_cooling_device *
 thermal_cooling_device_register(char *type, void *devdata,
@@ -492,6 +523,12 @@ static inline struct thermal_zone_device *thermal_zone_get_zone_by_name(
 static inline int thermal_zone_get_temp(
 		struct thermal_zone_device *tz, int *temp)
 { return -ENODEV; }
+static inline int thermal_zone_get_slope(
+		struct thermal_zone_device *tz)
+{ return -ENODEV; }
+static inline int thermal_zone_get_offset(
+		struct thermal_zone_device *tz)
+{ return -ENODEV; }
 static inline int get_tz_trend(struct thermal_zone_device *tz, int trip)
 { return -ENODEV; }
 static inline struct thermal_instance *
diff --git a/include/linux/watchdog.h b/include/linux/watchdog.h
index 7047bc7f8106..35a4d8185b51 100644
--- a/include/linux/watchdog.h
+++ b/include/linux/watchdog.h
@@ -19,6 +19,7 @@
 struct watchdog_ops;
 struct watchdog_device;
 struct watchdog_core_data;
+struct watchdog_governor;
 
 /** struct watchdog_ops - The watchdog-devices operations
  *
@@ -28,6 +29,7 @@ struct watchdog_core_data;
  * @ping:	The routine that sends a keepalive ping to the watchdog device.
  * @status:	The routine that shows the status of the watchdog device.
  * @set_timeout:The routine for setting the watchdog devices timeout value (in seconds).
+ * @set_pretimeout:The routine for setting the watchdog devices pretimeout.
  * @get_timeleft:The routine that gets the time left before a reset (in seconds).
  * @restart:	The routine for restarting the machine.
  * @ioctl:	The routines that handles extra ioctl calls.
@@ -46,6 +48,7 @@ struct watchdog_ops {
 	int (*ping)(struct watchdog_device *);
 	unsigned int (*status)(struct watchdog_device *);
 	int (*set_timeout)(struct watchdog_device *, unsigned int);
+	int (*set_pretimeout)(struct watchdog_device *, unsigned int);
 	unsigned int (*get_timeleft)(struct watchdog_device *);
 	int (*restart)(struct watchdog_device *, unsigned long, void *);
 	long (*ioctl)(struct watchdog_device *, unsigned int, unsigned long);
@@ -59,8 +62,10 @@ struct watchdog_ops {
  *		watchdog device.
  * @info:	Pointer to a watchdog_info structure.
  * @ops:	Pointer to the list of watchdog operations.
+ * @gov:	Pointer to watchdog pretimeout governor.
  * @bootstatus:	Status of the watchdog device at boot.
  * @timeout:	The watchdog devices timeout value (in seconds).
+ * @pretimeout: The watchdog devices pre_timeout value.
  * @min_timeout:The watchdog devices minimum timeout value (in seconds).
  * @max_timeout:The watchdog devices maximum timeout value (in seconds)
  *		as configurable from user space. Only relevant if
@@ -94,8 +99,10 @@ struct watchdog_device {
 	const struct attribute_group **groups;
 	const struct watchdog_info *info;
 	const struct watchdog_ops *ops;
+	const struct watchdog_governor *gov;
 	unsigned int bootstatus;
 	unsigned int timeout;
+	unsigned int pretimeout;
 	unsigned int min_timeout;
 	unsigned int max_timeout;
 	unsigned int min_hw_heartbeat_ms;
@@ -163,6 +170,13 @@ static inline bool watchdog_timeout_invalid(struct watchdog_device *wdd, unsigne
 		 t > wdd->max_timeout);
 }
 
+/* Use the following function to check if a pretimeout value is invalid */
+static inline bool watchdog_pretimeout_invalid(struct watchdog_device *wdd,
+					       unsigned int t)
+{
+	return t && wdd->timeout && t >= wdd->timeout;
+}
+
 /* Use the following functions to manipulate watchdog driver specific data */
 static inline void watchdog_set_drvdata(struct watchdog_device *wdd, void *data)
 {
@@ -174,6 +188,16 @@ static inline void *watchdog_get_drvdata(struct watchdog_device *wdd)
 	return wdd->driver_data;
 }
 
+/* Use the following functions to report watchdog pretimeout event */
+#if IS_ENABLED(CONFIG_WATCHDOG_PRETIMEOUT_GOV)
+void watchdog_notify_pretimeout(struct watchdog_device *wdd);
+#else
+static inline void watchdog_notify_pretimeout(struct watchdog_device *wdd)
+{
+	pr_alert("watchdog%d: pretimeout event\n", wdd->id);
+}
+#endif
+
 /* drivers/watchdog/watchdog_core.c */
 void watchdog_set_restart_priority(struct watchdog_device *wdd, int priority);
 extern int watchdog_init_timeout(struct watchdog_device *wdd,
diff --git a/include/net/bonding.h b/include/net/bonding.h
index 6360c259da6d..f32f7ef8a23a 100644
--- a/include/net/bonding.h
+++ b/include/net/bonding.h
@@ -37,18 +37,6 @@
 #ifndef __long_aligned
 #define __long_aligned __attribute__((aligned((sizeof(long)))))
 #endif
-/*
- * Less bad way to call ioctl from within the kernel; this needs to be
- * done some other way to get the call out of interrupt context.
- * Needs "ioctl" variable to be supplied by calling context.
- */
-#define IOCTL(dev, arg, cmd) ({		\
-	int res = 0;			\
-	mm_segment_t fs = get_fs();	\
-	set_fs(get_ds());		\
-	res = ioctl(dev, arg, cmd);	\
-	set_fs(fs);			\
-	res; })
 
 #define BOND_MODE(bond) ((bond)->params.mode)
 
diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h
index b220dabeab45..3832099289c5 100644
--- a/include/net/l3mdev.h
+++ b/include/net/l3mdev.h
@@ -114,6 +114,25 @@ static inline u32 l3mdev_fib_table(const struct net_device *dev)
 	return tb_id;
 }
 
+static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
+{
+	struct net_device *dev;
+	bool rc = false;
+
+	if (ifindex == 0)
+		return false;
+
+	rcu_read_lock();
+
+	dev = dev_get_by_index_rcu(net, ifindex);
+	if (dev)
+		rc = netif_is_l3_master(dev);
+
+	rcu_read_unlock();
+
+	return rc;
+}
+
 struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6);
 
 static inline
@@ -207,6 +226,11 @@ static inline u32 l3mdev_fib_table_by_index(struct net *net, int ifindex)
 	return 0;
 }
 
+static inline bool netif_index_is_l3_master(struct net *net, int ifindex)
+{
+	return false;
+}
+
 static inline
 struct dst_entry *l3mdev_link_scope_lookup(struct net *net, struct flowi6 *fl6)
 {
diff --git a/include/uapi/linux/falloc.h b/include/uapi/linux/falloc.h
index 3e445a760f14..b075f601919b 100644
--- a/include/uapi/linux/falloc.h
+++ b/include/uapi/linux/falloc.h
@@ -58,4 +58,22 @@
  */
 #define FALLOC_FL_INSERT_RANGE		0x20
 
+/*
+ * FALLOC_FL_UNSHARE_RANGE is used to unshare shared blocks within the
+ * file size without overwriting any existing data. The purpose of this
+ * call is to preemptively reallocate any blocks that are subject to
+ * copy-on-write.
+ *
+ * Different filesystems may implement different limitations on the
+ * granularity of the operation. Most will limit operations to filesystem
+ * block size boundaries, but this boundary may be larger or smaller
+ * depending on the filesystem and/or the configuration of the filesystem
+ * or file.
+ *
+ * This flag can only be used with allocate-mode fallocate, which is
+ * to say that it cannot be used with the punch, zero, collapse, or
+ * insert range modes.
+ */
+#define FALLOC_FL_UNSHARE_RANGE		0x40
+
 #endif /* _UAPI_FALLOC_H_ */
diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h
index 2473272169f2..acb2b6152ba0 100644
--- a/include/uapi/linux/fs.h
+++ b/include/uapi/linux/fs.h
@@ -158,7 +158,8 @@ struct fsxattr {
 	__u32		fsx_extsize;	/* extsize field value (get/set)*/
 	__u32		fsx_nextents;	/* nextents field value (get)	*/
 	__u32		fsx_projid;	/* project identifier (get/set) */
-	unsigned char	fsx_pad[12];
+	__u32		fsx_cowextsize;	/* CoW extsize field value (get/set)*/
+	unsigned char	fsx_pad[8];
 };
 
 /*
@@ -179,6 +180,7 @@ struct fsxattr {
 #define FS_XFLAG_NODEFRAG	0x00002000	/* do not defragment */
 #define FS_XFLAG_FILESTREAM	0x00004000	/* use filestream allocator */
 #define FS_XFLAG_DAX		0x00008000	/* use DAX for IO */
+#define FS_XFLAG_COWEXTSIZE	0x00010000	/* CoW extent size allocator hint */
 #define FS_XFLAG_HASATTR	0x80000000	/* no DIFLAG for this	*/
 
 /* the read-only stuff doesn't really belong here, but any other place is
diff --git a/include/uapi/linux/nfs4.h b/include/uapi/linux/nfs4.h
index 2b871e0858d9..4ae62796bfde 100644
--- a/include/uapi/linux/nfs4.h
+++ b/include/uapi/linux/nfs4.h
@@ -39,8 +39,9 @@
 #define NFS4_FH_VOL_MIGRATION		0x0004
 #define NFS4_FH_VOL_RENAME		0x0008
 
-#define NFS4_OPEN_RESULT_CONFIRM 0x0002
-#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX 0x0004
+#define NFS4_OPEN_RESULT_CONFIRM		0x0002
+#define NFS4_OPEN_RESULT_LOCKTYPE_POSIX		0x0004
+#define NFS4_OPEN_RESULT_MAY_NOTIFY_LOCK	0x0020
 
 #define NFS4_SHARE_ACCESS_MASK	0x000F
 #define NFS4_SHARE_ACCESS_READ	0x0001
diff --git a/include/video/exynos_mipi_dsim.h b/include/video/exynos_mipi_dsim.h
deleted file mode 100644
index 6a578f8a1b3e..000000000000
--- a/include/video/exynos_mipi_dsim.h
+++ /dev/null
@@ -1,358 +0,0 @@
-/* include/video/exynos_mipi_dsim.h
- *
- * Platform data header for Samsung SoC MIPI-DSIM.
- *
- * Copyright (c) 2012 Samsung Electronics Co., Ltd
- *
- * InKi Dae <inki.dae@samsung.com>
- * Donghwa Lee <dh09.lee@samsung.com>
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License version 2 as
- * published by the Free Software Foundation.
-*/
-
-#ifndef _EXYNOS_MIPI_DSIM_H
-#define _EXYNOS_MIPI_DSIM_H
-
-#include <linux/device.h>
-#include <linux/fb.h>
-
-#define PANEL_NAME_SIZE		(32)
-
-/*
- * Enumerate display interface type.
- *
- * DSIM_COMMAND means cpu interface and rgb interface for DSIM_VIDEO.
- *
- * P.S. MIPI DSI Master has two display controller intefaces, RGB Interface
- *	for main display and CPU Interface(same as I80 Interface) for main
- *	and sub display.
- */
-enum mipi_dsim_interface_type {
-	DSIM_COMMAND,
-	DSIM_VIDEO
-};
-
-enum mipi_dsim_virtual_ch_no {
-	DSIM_VIRTUAL_CH_0,
-	DSIM_VIRTUAL_CH_1,
-	DSIM_VIRTUAL_CH_2,
-	DSIM_VIRTUAL_CH_3
-};
-
-enum mipi_dsim_burst_mode_type {
-	DSIM_NON_BURST_SYNC_EVENT,
-	DSIM_BURST_SYNC_EVENT,
-	DSIM_NON_BURST_SYNC_PULSE,
-	DSIM_BURST,
-	DSIM_NON_VIDEO_MODE
-};
-
-enum mipi_dsim_no_of_data_lane {
-	DSIM_DATA_LANE_1,
-	DSIM_DATA_LANE_2,
-	DSIM_DATA_LANE_3,
-	DSIM_DATA_LANE_4
-};
-
-enum mipi_dsim_byte_clk_src {
-	DSIM_PLL_OUT_DIV8,
-	DSIM_EXT_CLK_DIV8,
-	DSIM_EXT_CLK_BYPASS
-};
-
-enum mipi_dsim_pixel_format {
-	DSIM_CMD_3BPP,
-	DSIM_CMD_8BPP,
-	DSIM_CMD_12BPP,
-	DSIM_CMD_16BPP,
-	DSIM_VID_16BPP_565,
-	DSIM_VID_18BPP_666PACKED,
-	DSIM_18BPP_666LOOSELYPACKED,
-	DSIM_24BPP_888
-};
-
-/*
- * struct mipi_dsim_config - interface for configuring mipi-dsi controller.
- *
- * @auto_flush: enable or disable Auto flush of MD FIFO using VSYNC pulse.
- * @eot_disable: enable or disable EoT packet in HS mode.
- * @auto_vertical_cnt: specifies auto vertical count mode.
- *	in Video mode, the vertical line transition uses line counter
- *	configured by VSA, VBP, and Vertical resolution.
- *	If this bit is set to '1', the line counter does not use VSA and VBP
- *	registers.(in command mode, this variable is ignored)
- * @hse: set horizontal sync event mode.
- *	In VSYNC pulse and Vporch area, MIPI DSI master transfers only HSYNC
- *	start packet to MIPI DSI slave at MIPI DSI spec1.1r02.
- *	this bit transfers HSYNC end packet in VSYNC pulse and Vporch area
- *	(in mommand mode, this variable is ignored)
- * @hfp: specifies HFP disable mode.
- *	if this variable is set, DSI master ignores HFP area in VIDEO mode.
- *	(in command mode, this variable is ignored)
- * @hbp: specifies HBP disable mode.
- *	if this variable is set, DSI master ignores HBP area in VIDEO mode.
- *	(in command mode, this variable is ignored)
- * @hsa: specifies HSA disable mode.
- *	if this variable is set, DSI master ignores HSA area in VIDEO mode.
- *	(in command mode, this variable is ignored)
- * @cma_allow: specifies the number of horizontal lines, where command packet
- *	transmission is allowed after Stable VFP period.
- * @e_interface: specifies interface to be used.(CPU or RGB interface)
- * @e_virtual_ch: specifies virtual channel number that main or
- *	sub diaplsy uses.
- * @e_pixel_format: specifies pixel stream format for main or sub display.
- * @e_burst_mode: selects Burst mode in Video mode.
- *	in Non-burst mode, RGB data area is filled with RGB data and NULL
- *	packets, according to input bandwidth of RGB interface.
- *	In Burst mode, RGB data area is filled with RGB data only.
- * @e_no_data_lane: specifies data lane count to be used by Master.
- * @e_byte_clk: select byte clock source. (it must be DSIM_PLL_OUT_DIV8)
- *	DSIM_EXT_CLK_DIV8 and DSIM_EXT_CLK_BYPASSS are not supported.
- * @pll_stable_time: specifies the PLL Timer for stability of the ganerated
- *	clock(System clock cycle base)
- *	if the timer value goes to 0x00000000, the clock stable bit of status
- *	and interrupt register is set.
- * @esc_clk: specifies escape clock frequency for getting the escape clock
- *	prescaler value.
- * @stop_holding_cnt: specifies the interval value between transmitting
- *	read packet(or write "set_tear_on" command) and BTA request.
- *	after transmitting read packet or write "set_tear_on" command,
- *	BTA requests to D-PHY automatically. this counter value specifies
- *	the interval between them.
- * @bta_timeout: specifies the timer for BTA.
- *	this register specifies time out from BTA request to change
- *	the direction with respect to Tx escape clock.
- * @rx_timeout: specifies the timer for LP Rx mode timeout.
- *	this register specifies time out on how long RxValid deasserts,
- *	after RxLpdt asserts with respect to Tx escape clock.
- *	- RxValid specifies Rx data valid indicator.
- *	- RxLpdt specifies an indicator that D-PHY is under RxLpdt mode.
- *	- RxValid and RxLpdt specifies signal from D-PHY.
- */
-struct mipi_dsim_config {
-	unsigned char			auto_flush;
-	unsigned char			eot_disable;
-
-	unsigned char			auto_vertical_cnt;
-	unsigned char			hse;
-	unsigned char			hfp;
-	unsigned char			hbp;
-	unsigned char			hsa;
-	unsigned char			cmd_allow;
-
-	enum mipi_dsim_interface_type	e_interface;
-	enum mipi_dsim_virtual_ch_no	e_virtual_ch;
-	enum mipi_dsim_pixel_format	e_pixel_format;
-	enum mipi_dsim_burst_mode_type	e_burst_mode;
-	enum mipi_dsim_no_of_data_lane	e_no_data_lane;
-	enum mipi_dsim_byte_clk_src	e_byte_clk;
-
-	/*
-	 * ===========================================
-	 * |    P    |    M    |    S    |    MHz    |
-	 * -------------------------------------------
-	 * |    3    |   100   |    3    |    100    |
-	 * |    3    |   100   |    2    |    200    |
-	 * |    3    |    63   |    1    |    252    |
-	 * |    4    |   100   |    1    |    300    |
-	 * |    4    |   110   |    1    |    330    |
-	 * |   12    |   350   |    1    |    350    |
-	 * |    3    |   100   |    1    |    400    |
-	 * |    4    |   150   |    1    |    450    |
-	 * |    6    |   118   |    1    |    472    |
-	 * |	3    |   120   |    1    |    480    |
-	 * |   12    |   250   |    0    |    500    |
-	 * |    4    |   100   |    0    |    600    |
-	 * |    3    |    81   |    0    |    648    |
-	 * |    3    |    88   |    0    |    704    |
-	 * |    3    |    90   |    0    |    720    |
-	 * |    3    |   100   |    0    |    800    |
-	 * |   12    |   425   |    0    |    850    |
-	 * |    4    |   150   |    0    |    900    |
-	 * |   12    |   475   |    0    |    950    |
-	 * |    6    |   250   |    0    |   1000    |
-	 * -------------------------------------------
-	 */
-
-	/*
-	 * pms could be calculated as the following.
-	 * M * 24 / P * 2 ^ S = MHz
-	 */
-	unsigned char			p;
-	unsigned short			m;
-	unsigned char			s;
-
-	unsigned int			pll_stable_time;
-	unsigned long			esc_clk;
-
-	unsigned short			stop_holding_cnt;
-	unsigned char			bta_timeout;
-	unsigned short			rx_timeout;
-};
-
-/*
- * struct mipi_dsim_device - global interface for mipi-dsi driver.
- *
- * @dev: driver model representation of the device.
- * @id: unique device id.
- * @clock: pointer to MIPI-DSI clock of clock framework.
- * @irq: interrupt number to MIPI-DSI controller.
- * @reg_base: base address to memory mapped SRF of MIPI-DSI controller.
- *	(virtual address)
- * @lock: the mutex protecting this data structure.
- * @dsim_info: infomation for configuring mipi-dsi controller.
- * @master_ops: callbacks to mipi-dsi operations.
- * @dsim_lcd_dev: pointer to activated ddi device.
- *	(it would be registered by mipi-dsi driver.)
- * @dsim_lcd_drv: pointer to activated_ddi driver.
- *	(it would be registered by mipi-dsi driver.)
- * @lcd_info: pointer to mipi_lcd_info structure.
- * @state: specifies status of MIPI-DSI controller.
- *	the status could be RESET, INIT, STOP, HSCLKEN and ULPS.
- * @data_lane: specifiec enabled data lane number.
- *	this variable would be set by driver according to e_no_data_lane
- *	automatically.
- * @e_clk_src: select byte clock source.
- * @pd: pointer to MIPI-DSI driver platform data.
- * @phy: pointer to the MIPI-DSI PHY
- */
-struct mipi_dsim_device {
-	struct device			*dev;
-	int				id;
-	struct clk			*clock;
-	unsigned int			irq;
-	void __iomem			*reg_base;
-	struct mutex			lock;
-
-	struct mipi_dsim_config		*dsim_config;
-	struct mipi_dsim_master_ops	*master_ops;
-	struct mipi_dsim_lcd_device	*dsim_lcd_dev;
-	struct mipi_dsim_lcd_driver	*dsim_lcd_drv;
-
-	unsigned int			state;
-	unsigned int			data_lane;
-	unsigned int			e_clk_src;
-	bool				suspended;
-
-	struct mipi_dsim_platform_data	*pd;
-	struct phy			*phy;
-};
-
-/*
- * struct mipi_dsim_platform_data - interface to platform data
- *	for mipi-dsi driver.
- *
- * @lcd_panel_name: specifies lcd panel name registered to mipi-dsi driver.
- *	lcd panel driver searched would be actived.
- * @dsim_config: pointer of structure for configuring mipi-dsi controller.
- * @enabled: indicate whether mipi controller got enabled or not.
- * @lcd_panel_info: pointer for lcd panel specific structure.
- *	this structure specifies width, height, timing and polarity and so on.
- */
-struct mipi_dsim_platform_data {
-	char				lcd_panel_name[PANEL_NAME_SIZE];
-
-	struct mipi_dsim_config		*dsim_config;
-	unsigned int			enabled;
-	void				*lcd_panel_info;
-};
-
-/*
- * struct mipi_dsim_master_ops - callbacks to mipi-dsi operations.
- *
- * @cmd_write: transfer command to lcd panel at LP mode.
- * @cmd_read: read command from rx register.
- * @get_dsim_frame_done: get the status that all screen data have been
- *	transferred to mipi-dsi.
- * @clear_dsim_frame_done: clear frame done status.
- * @get_fb_frame_done: get frame done status of display controller.
- * @trigger: trigger display controller.
- *	- this one would be used only in case of CPU mode.
- *  @set_early_blank_mode: set framebuffer blank mode.
- *	- this callback should be called prior to fb_blank() by a client driver
- *	only if needing.
- *  @set_blank_mode: set framebuffer blank mode.
- *	- this callback should be called after fb_blank() by a client driver
- *	only if needing.
- */
-
-struct mipi_dsim_master_ops {
-	int (*cmd_write)(struct mipi_dsim_device *dsim, unsigned int data_id,
-		const unsigned char *data0, unsigned int data1);
-	int (*cmd_read)(struct mipi_dsim_device *dsim, unsigned int data_id,
-		unsigned int data0, unsigned int req_size, u8 *rx_buf);
-	int (*get_dsim_frame_done)(struct mipi_dsim_device *dsim);
-	int (*clear_dsim_frame_done)(struct mipi_dsim_device *dsim);
-
-	int (*get_fb_frame_done)(struct fb_info *info);
-	void (*trigger)(struct fb_info *info);
-	int (*set_early_blank_mode)(struct mipi_dsim_device *dsim, int power);
-	int (*set_blank_mode)(struct mipi_dsim_device *dsim, int power);
-};
-
-/*
- * device structure for mipi-dsi based lcd panel.
- *
- * @name: name of the device to use with this device, or an
- *	alias for that name.
- * @dev: driver model representation of the device.
- * @id: id of device to be registered.
- * @bus_id: bus id for identifing connected bus
- *	and this bus id should be same as id of mipi_dsim_device.
- * @irq: irq number for signaling when framebuffer transfer of
- *	lcd panel module is completed.
- *	this irq would be used only for MIPI-DSI based CPU mode lcd panel.
- * @master: pointer to mipi-dsi master device object.
- * @platform_data: lcd panel specific platform data.
- */
-struct mipi_dsim_lcd_device {
-	char			*name;
-	struct device		dev;
-	int			id;
-	int			bus_id;
-	int			irq;
-	int			panel_reverse;
-
-	struct mipi_dsim_device *master;
-	void			*platform_data;
-};
-
-/*
- * driver structure for mipi-dsi based lcd panel.
- *
- * this structure should be registered by lcd panel driver.
- * mipi-dsi driver seeks lcd panel registered through name field
- * and calls these callback functions in appropriate time.
- *
- * @name: name of the driver to use with this device, or an
- *	alias for that name.
- * @id: id of driver to be registered.
- *	this id would be used for finding device object registered.
- */
-struct mipi_dsim_lcd_driver {
-	char			*name;
-	int			id;
-
-	void	(*power_on)(struct mipi_dsim_lcd_device *dsim_dev, int enable);
-	void	(*set_sequence)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*probe)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*remove)(struct mipi_dsim_lcd_device *dsim_dev);
-	void	(*shutdown)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*suspend)(struct mipi_dsim_lcd_device *dsim_dev);
-	int	(*resume)(struct mipi_dsim_lcd_device *dsim_dev);
-};
-
-/*
- * register mipi_dsim_lcd_device to mipi-dsi master.
- */
-int exynos_mipi_dsi_register_lcd_device(struct mipi_dsim_lcd_device
-						*lcd_dev);
-/**
- * register mipi_dsim_lcd_driver object defined by lcd panel driver
- * to mipi-dsi driver.
- */
-int exynos_mipi_dsi_register_lcd_driver(struct mipi_dsim_lcd_driver
-						*lcd_drv);
-#endif /* _EXYNOS_MIPI_DSIM_H */
diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile
index 992ab9d99f35..e57980845549 100644
--- a/kernel/trace/Makefile
+++ b/kernel/trace/Makefile
@@ -1,8 +1,4 @@
 
-# We are fully aware of the dangers of __builtin_return_address()
-FRAME_CFLAGS := $(call cc-disable-warning,frame-address)
-KBUILD_CFLAGS += $(FRAME_CFLAGS)
-
 # Do not instrument the tracer itself:
 
 ifdef CONFIG_FUNCTION_TRACER
diff --git a/mm/Makefile b/mm/Makefile
index 2ca1faf3fa09..295bd7a9f76b 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -21,9 +21,6 @@ KCOV_INSTRUMENT_memcontrol.o := n
 KCOV_INSTRUMENT_mmzone.o := n
 KCOV_INSTRUMENT_vmstat.o := n
 
-# Since __builtin_frame_address does work as used, disable the warning.
-CFLAGS_usercopy.o += $(call cc-disable-warning, frame-address)
-
 mmu-y			:= nommu.o
 mmu-$(CONFIG_MMU)	:= gup.o highmem.o memory.o mincore.o \
 			   mlock.o mmap.o mprotect.o mremap.o msync.o rmap.o \
diff --git a/net/bridge/br_sysfs_if.c b/net/bridge/br_sysfs_if.c
index e657258e1f2c..8bd569695e76 100644
--- a/net/bridge/br_sysfs_if.c
+++ b/net/bridge/br_sysfs_if.c
@@ -217,6 +217,7 @@ static const struct brport_attribute *brport_attrs[] = {
 #endif
 	&brport_attr_proxyarp,
 	&brport_attr_proxyarp_wifi,
+	&brport_attr_multicast_flood,
 	NULL
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index b06d2f46b83e..fb7348f13501 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -1144,6 +1144,8 @@ static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
 	if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
 		return 0;
 
+	memset(&vf_vlan_info, 0, sizeof(vf_vlan_info));
+
 	vf_mac.vf =
 		vf_vlan.vf =
 		vf_vlan_info.vf =
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f2be689a6c85..62d4d90c1389 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2265,7 +2265,8 @@ struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
 	if (err) {
 		res.fi = NULL;
 		res.table = NULL;
-		if (fl4->flowi4_oif) {
+		if (fl4->flowi4_oif &&
+		    !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
 			/* Apparently, routing tables are wrong. Assume,
 			   that the destination is on link.
 
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 54cf7197c7ab..5a27ab4eab39 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1190,6 +1190,16 @@ out:
 	return NULL;
 }
 
+static void tcp_v6_restore_cb(struct sk_buff *skb)
+{
+	/* We need to move header back to the beginning if xfrm6_policy_check()
+	 * and tcp_v6_fill_cb() are going to be called again.
+	 * ip6_datagram_recv_specific_ctl() also expects IP6CB to be there.
+	 */
+	memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
+		sizeof(struct inet6_skb_parm));
+}
+
 /* The socket must have it's spinlock held when we get
  * here, unless it is a TCP_LISTEN socket.
  *
@@ -1319,6 +1329,7 @@ ipv6_pktoptions:
 			np->flow_label = ip6_flowlabel(ipv6_hdr(opt_skb));
 		if (ipv6_opt_accepted(sk, opt_skb, &TCP_SKB_CB(opt_skb)->header.h6)) {
 			skb_set_owner_r(opt_skb, sk);
+			tcp_v6_restore_cb(opt_skb);
 			opt_skb = xchg(&np->pktoptions, opt_skb);
 		} else {
 			__kfree_skb(opt_skb);
@@ -1352,15 +1363,6 @@ static void tcp_v6_fill_cb(struct sk_buff *skb, const struct ipv6hdr *hdr,
 	TCP_SKB_CB(skb)->sacked = 0;
 }
 
-static void tcp_v6_restore_cb(struct sk_buff *skb)
-{
-	/* We need to move header back to the beginning if xfrm6_policy_check()
-	 * and tcp_v6_fill_cb() are going to be called again.
-	 */
-	memmove(IP6CB(skb), &TCP_SKB_CB(skb)->header.h6,
-		sizeof(struct inet6_skb_parm));
-}
-
 static int tcp_v6_rcv(struct sk_buff *skb)
 {
 	const struct tcphdr *th;
diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
index c8c82e109c68..22087062bd10 100644
--- a/net/openvswitch/flow.c
+++ b/net/openvswitch/flow.c
@@ -343,7 +343,7 @@ static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
 	key->eth.cvlan.tci = 0;
 	key->eth.cvlan.tpid = 0;
 
-	if (likely(skb_vlan_tag_present(skb))) {
+	if (skb_vlan_tag_present(skb)) {
 		key->eth.vlan.tci = htons(skb->vlan_tci);
 		key->eth.vlan.tpid = skb->vlan_proto;
 	} else {
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index 95c36147a6e1..e7da29021b38 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -176,7 +176,7 @@ static void do_setup(struct net_device *netdev)
 
 	netdev->vlan_features = netdev->features;
 	netdev->hw_enc_features = netdev->features;
-	netdev->features |= NETIF_F_HW_VLAN_CTAG_TX;
+	netdev->features |= NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_STAG_TX;
 	netdev->hw_features = netdev->features & ~NETIF_F_LLTX;
 
 	eth_hw_addr_random(netdev);
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 8f198437c724..7387418ac514 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -485,7 +485,8 @@ static unsigned int packet_length(const struct sk_buff *skb)
 {
 	unsigned int length = skb->len - ETH_HLEN;
 
-	if (skb_vlan_tagged(skb))
+	if (!skb_vlan_tag_present(skb) &&
+	    eth_type_vlan(skb->protocol))
 		length -= VLAN_HLEN;
 
 	/* Don't subtract for multiple VLAN tags. Most (all?) drivers allow
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index c9102172ce3b..a512b18c0088 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -341,22 +341,25 @@ int tcf_register_action(struct tc_action_ops *act,
 	if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
 		return -EINVAL;
 
+	/* We have to register pernet ops before making the action ops visible,
+	 * otherwise tcf_action_init_1() could get a partially initialized
+	 * netns.
+	 */
+	ret = register_pernet_subsys(ops);
+	if (ret)
+		return ret;
+
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
 			write_unlock(&act_mod_lock);
+			unregister_pernet_subsys(ops);
 			return -EEXIST;
 		}
 	}
 	list_add_tail(&act->head, &act_base);
 	write_unlock(&act_mod_lock);
 
-	ret = register_pernet_subsys(ops);
-	if (ret) {
-		tcf_unregister_action(act, ops);
-		return ret;
-	}
-
 	return 0;
 }
 EXPORT_SYMBOL(tcf_register_action);
@@ -367,8 +370,6 @@ int tcf_unregister_action(struct tc_action_ops *act,
 	struct tc_action_ops *a;
 	int err = -ENOENT;
 
-	unregister_pernet_subsys(ops);
-
 	write_lock(&act_mod_lock);
 	list_for_each_entry(a, &act_base, head) {
 		if (a == act) {
@@ -378,6 +379,8 @@ int tcf_unregister_action(struct tc_action_ops *act,
 		}
 	}
 	write_unlock(&act_mod_lock);
+	if (!err)
+		unregister_pernet_subsys(ops);
 	return err;
 }
 EXPORT_SYMBOL(tcf_unregister_action);
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 11da7da0b7c4..2ee29a3375f6 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -101,7 +101,7 @@ EXPORT_SYMBOL(unregister_tcf_proto_ops);
 
 static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 			  struct nlmsghdr *n, struct tcf_proto *tp,
-			  unsigned long fh, int event);
+			  unsigned long fh, int event, bool unicast);
 
 static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 				 struct nlmsghdr *n,
@@ -112,7 +112,7 @@ static void tfilter_notify_chain(struct net *net, struct sk_buff *oskb,
 
 	for (it_chain = chain; (tp = rtnl_dereference(*it_chain)) != NULL;
 	     it_chain = &tp->next)
-		tfilter_notify(net, oskb, n, tp, 0, event);
+		tfilter_notify(net, oskb, n, tp, 0, event, false);
 }
 
 /* Select new prio value from the range, managed by kernel. */
@@ -319,7 +319,8 @@ replay:
 
 			RCU_INIT_POINTER(*back, next);
 
-			tfilter_notify(net, skb, n, tp, fh, RTM_DELTFILTER);
+			tfilter_notify(net, skb, n, tp, fh,
+				       RTM_DELTFILTER, false);
 			tcf_destroy(tp, true);
 			err = 0;
 			goto errout;
@@ -345,14 +346,14 @@ replay:
 				struct tcf_proto *next = rtnl_dereference(tp->next);
 
 				tfilter_notify(net, skb, n, tp, fh,
-					       RTM_DELTFILTER);
+					       RTM_DELTFILTER, false);
 				if (tcf_destroy(tp, false))
 					RCU_INIT_POINTER(*back, next);
 			}
 			goto errout;
 		case RTM_GETTFILTER:
 			err = tfilter_notify(net, skb, n, tp, fh,
-					     RTM_NEWTFILTER);
+					     RTM_NEWTFILTER, true);
 			goto errout;
 		default:
 			err = -EINVAL;
@@ -367,7 +368,7 @@ replay:
 			RCU_INIT_POINTER(tp->next, rtnl_dereference(*back));
 			rcu_assign_pointer(*back, tp);
 		}
-		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER);
+		tfilter_notify(net, skb, n, tp, fh, RTM_NEWTFILTER, false);
 	} else {
 		if (tp_created)
 			tcf_destroy(tp, true);
@@ -419,7 +420,7 @@ nla_put_failure:
 
 static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 			  struct nlmsghdr *n, struct tcf_proto *tp,
-			  unsigned long fh, int event)
+			  unsigned long fh, int event, bool unicast)
 {
 	struct sk_buff *skb;
 	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
@@ -433,6 +434,9 @@ static int tfilter_notify(struct net *net, struct sk_buff *oskb,
 		return -EINVAL;
 	}
 
+	if (unicast)
+		return netlink_unicast(net->rtnl, skb, portid, MSG_DONTWAIT);
+
 	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
 			      n->nlmsg_flags & NLM_F_ECHO);
 }
diff --git a/net/strparser/strparser.c b/net/strparser/strparser.c
index 5c7549b5b92c..41adf362936d 100644
--- a/net/strparser/strparser.c
+++ b/net/strparser/strparser.c
@@ -246,7 +246,7 @@ static int strp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
 				} else {
 					strp->rx_interrupted = 1;
 				}
-				strp_parser_err(strp, err, desc);
+				strp_parser_err(strp, len, desc);
 				break;
 			} else if (len > strp->sk->sk_rcvbuf) {
 				/* Message length exceeds maximum allowed */
diff --git a/net/sunrpc/auth.c b/net/sunrpc/auth.c
index a7e42f9a405c..2bff63a73cf8 100644
--- a/net/sunrpc/auth.c
+++ b/net/sunrpc/auth.c
@@ -551,7 +551,7 @@ rpcauth_lookup_credcache(struct rpc_auth *auth, struct auth_cred * acred,
 			*entry, *new;
 	unsigned int nr;
 
-	nr = hash_long(from_kuid(&init_user_ns, acred->uid), cache->hashbits);
+	nr = auth->au_ops->hash_cred(acred, cache->hashbits);
 
 	rcu_read_lock();
 	hlist_for_each_entry_rcu(entry, &cache->hashtable[nr], cr_hash) {
diff --git a/net/sunrpc/auth_generic.c b/net/sunrpc/auth_generic.c
index 83dffeadf20a..f1df9837f1ac 100644
--- a/net/sunrpc/auth_generic.c
+++ b/net/sunrpc/auth_generic.c
@@ -78,6 +78,14 @@ static struct rpc_cred *generic_bind_cred(struct rpc_task *task,
 	return auth->au_ops->lookup_cred(auth, acred, lookupflags);
 }
 
+static int
+generic_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+	return hash_64(from_kgid(&init_user_ns, acred->gid) |
+		((u64)from_kuid(&init_user_ns, acred->uid) <<
+			(sizeof(gid_t) * 8)), hashbits);
+}
+
 /*
  * Lookup generic creds for current process
  */
@@ -258,6 +266,7 @@ generic_key_timeout(struct rpc_auth *auth, struct rpc_cred *cred)
 static const struct rpc_authops generic_auth_ops = {
 	.owner = THIS_MODULE,
 	.au_name = "Generic",
+	.hash_cred = generic_hash_cred,
 	.lookup_cred = generic_lookup_cred,
 	.crcreate = generic_create_cred,
 	.key_timeout = generic_key_timeout,
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index 976c7812bbd5..d8bd97a5a7c9 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -1298,6 +1298,12 @@ gss_destroy_cred(struct rpc_cred *cred)
 	gss_destroy_nullcred(cred);
 }
 
+static int
+gss_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+	return hash_64(from_kuid(&init_user_ns, acred->uid), hashbits);
+}
+
 /*
  * Lookup RPCSEC_GSS cred for the current process
  */
@@ -1982,6 +1988,7 @@ static const struct rpc_authops authgss_ops = {
 	.au_name	= "RPCSEC_GSS",
 	.create		= gss_create,
 	.destroy	= gss_destroy,
+	.hash_cred	= gss_hash_cred,
 	.lookup_cred	= gss_lookup_cred,
 	.crcreate	= gss_create_cred,
 	.list_pseudoflavors = gss_mech_list_pseudoflavors,
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index a1d768a973f5..306fc0f54596 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -46,6 +46,14 @@ unx_destroy(struct rpc_auth *auth)
 	rpcauth_clear_credcache(auth->au_credcache);
 }
 
+static int
+unx_hash_cred(struct auth_cred *acred, unsigned int hashbits)
+{
+	return hash_64(from_kgid(&init_user_ns, acred->gid) |
+		((u64)from_kuid(&init_user_ns, acred->uid) <<
+			(sizeof(gid_t) * 8)), hashbits);
+}
+
 /*
  * Lookup AUTH_UNIX creds for current process
  */
@@ -220,6 +228,7 @@ const struct rpc_authops authunix_ops = {
 	.au_name	= "UNIX",
 	.create		= unx_create,
 	.destroy	= unx_destroy,
+	.hash_cred	= unx_hash_cred,
 	.lookup_cred	= unx_lookup_cred,
 	.crcreate	= unx_create_cred,
 };
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 229956bf8457..ac701c28f44f 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -76,13 +76,7 @@ static int xprt_alloc_xdr_buf(struct xdr_buf *buf, gfp_t gfp_flags)
 	page = alloc_page(gfp_flags);
 	if (page == NULL)
 		return -ENOMEM;
-	buf->head[0].iov_base = page_address(page);
-	buf->head[0].iov_len = PAGE_SIZE;
-	buf->tail[0].iov_base = NULL;
-	buf->tail[0].iov_len = 0;
-	buf->page_len = 0;
-	buf->len = 0;
-	buf->buflen = PAGE_SIZE;
+	xdr_buf_init(buf, page_address(page), PAGE_SIZE);
 	return 0;
 }
 
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4d8e11f94a35..8aabe12201f8 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -353,7 +353,7 @@ void sunrpc_init_cache_detail(struct cache_detail *cd)
 	spin_unlock(&cache_list_lock);
 
 	/* start the cleaning process */
-	schedule_delayed_work(&cache_cleaner, 0);
+	queue_delayed_work(system_power_efficient_wq, &cache_cleaner, 0);
 }
 EXPORT_SYMBOL_GPL(sunrpc_init_cache_detail);
 
@@ -476,7 +476,8 @@ static void do_cache_clean(struct work_struct *work)
 		delay = 0;
 
 	if (delay)
-		schedule_delayed_work(&cache_cleaner, delay);
+		queue_delayed_work(system_power_efficient_wq,
+				   &cache_cleaner, delay);
 }
 
 
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 66f23b376fa0..34dd7b26ee5f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -184,7 +184,6 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
 				   struct super_block *sb)
 {
 	struct dentry *dentry;
-	int err = 0;
 
 	switch (event) {
 	case RPC_PIPEFS_MOUNT:
@@ -201,7 +200,7 @@ static int __rpc_clnt_handle_event(struct rpc_clnt *clnt, unsigned long event,
 		printk(KERN_ERR "%s: unknown event: %ld\n", __func__, event);
 		return -ENOTSUPP;
 	}
-	return err;
+	return 0;
 }
 
 static int __rpc_pipefs_event(struct rpc_clnt *clnt, unsigned long event,
@@ -988,7 +987,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
 {
 
 	if (clnt != NULL) {
-		rpc_task_release_client(task);
 		if (task->tk_xprt == NULL)
 			task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
 		task->tk_client = clnt;
@@ -1693,6 +1691,7 @@ call_allocate(struct rpc_task *task)
 	struct rpc_rqst *req = task->tk_rqstp;
 	struct rpc_xprt *xprt = req->rq_xprt;
 	struct rpc_procinfo *proc = task->tk_msg.rpc_proc;
+	int status;
 
 	dprint_status(task);
 
@@ -1718,11 +1717,14 @@ call_allocate(struct rpc_task *task)
 	req->rq_rcvsize = RPC_REPHDRSIZE + slack + proc->p_replen;
 	req->rq_rcvsize <<= 2;
 
-	req->rq_buffer = xprt->ops->buf_alloc(task,
-					req->rq_callsize + req->rq_rcvsize);
-	if (req->rq_buffer != NULL)
-		return;
+	status = xprt->ops->buf_alloc(task);
 	xprt_inject_disconnect(xprt);
+	if (status == 0)
+		return;
+	if (status != -ENOMEM) {
+		rpc_exit(task, status);
+		return;
+	}
 
 	dprintk("RPC: %5u rpc_buffer allocation failed\n", task->tk_pid);
 
@@ -1748,18 +1750,6 @@ rpc_task_force_reencode(struct rpc_task *task)
 	task->tk_rqstp->rq_bytes_sent = 0;
 }
 
-static inline void
-rpc_xdr_buf_init(struct xdr_buf *buf, void *start, size_t len)
-{
-	buf->head[0].iov_base = start;
-	buf->head[0].iov_len = len;
-	buf->tail[0].iov_len = 0;
-	buf->page_len = 0;
-	buf->flags = 0;
-	buf->len = 0;
-	buf->buflen = len;
-}
-
 /*
  * 3.	Encode arguments of an RPC call
  */
@@ -1772,12 +1762,12 @@ rpc_xdr_encode(struct rpc_task *task)
 
 	dprint_status(task);
 
-	rpc_xdr_buf_init(&req->rq_snd_buf,
-			 req->rq_buffer,
-			 req->rq_callsize);
-	rpc_xdr_buf_init(&req->rq_rcv_buf,
-			 (char *)req->rq_buffer + req->rq_callsize,
-			 req->rq_rcvsize);
+	xdr_buf_init(&req->rq_snd_buf,
+		     req->rq_buffer,
+		     req->rq_callsize);
+	xdr_buf_init(&req->rq_rcv_buf,
+		     req->rq_rbuffer,
+		     req->rq_rcvsize);
 
 	p = rpc_encode_header(task);
 	if (p == NULL) {
@@ -2616,6 +2606,70 @@ int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
 EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
 
 /**
+ * rpc_clnt_setup_test_and_add_xprt()
+ *
+ * This is an rpc_clnt_add_xprt setup() function which returns 1 so:
+ *   1) caller of the test function must dereference the rpc_xprt_switch
+ *   and the rpc_xprt.
+ *   2) test function must call rpc_xprt_switch_add_xprt, usually in
+ *   the rpc_call_done routine.
+ *
+ * Upon success (return of 1), the test function adds the new
+ * transport to the rpc_clnt xprt switch
+ *
+ * @clnt: struct rpc_clnt to get the new transport
+ * @xps:  the rpc_xprt_switch to hold the new transport
+ * @xprt: the rpc_xprt to test
+ * @data: a struct rpc_add_xprt_test pointer that holds the test function
+ *        and test function call data
+ */
+int rpc_clnt_setup_test_and_add_xprt(struct rpc_clnt *clnt,
+				     struct rpc_xprt_switch *xps,
+				     struct rpc_xprt *xprt,
+				     void *data)
+{
+	struct rpc_cred *cred;
+	struct rpc_task *task;
+	struct rpc_add_xprt_test *xtest = (struct rpc_add_xprt_test *)data;
+	int status = -EADDRINUSE;
+
+	xprt = xprt_get(xprt);
+	xprt_switch_get(xps);
+
+	if (rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
+		goto out_err;
+
+	/* Test the connection */
+	cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+	task = rpc_call_null_helper(clnt, xprt, cred,
+				    RPC_TASK_SOFT | RPC_TASK_SOFTCONN,
+				    NULL, NULL);
+	put_rpccred(cred);
+	if (IS_ERR(task)) {
+		status = PTR_ERR(task);
+		goto out_err;
+	}
+	status = task->tk_status;
+	rpc_put_task(task);
+
+	if (status < 0)
+		goto out_err;
+
+	/* rpc_xprt_switch and rpc_xprt are deferrenced by add_xprt_test() */
+	xtest->add_xprt_test(clnt, xprt, xtest->data);
+
+	/* so that rpc_clnt_add_xprt does not call rpc_xprt_switch_add_xprt */
+	return 1;
+out_err:
+	xprt_put(xprt);
+	xprt_switch_put(xps);
+	pr_info("RPC:   rpc_clnt_test_xprt failed: %d addr %s not added\n",
+		status, xprt->address_strings[RPC_DISPLAY_ADDR]);
+	return status;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_setup_test_and_add_xprt);
+
+/**
  * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
  * @clnt: pointer to struct rpc_clnt
  * @xprtargs: pointer to struct xprt_create
@@ -2697,6 +2751,34 @@ rpc_cap_max_reconnect_timeout(struct rpc_clnt *clnt, unsigned long timeo)
 }
 EXPORT_SYMBOL_GPL(rpc_cap_max_reconnect_timeout);
 
+void rpc_clnt_xprt_switch_put(struct rpc_clnt *clnt)
+{
+	xprt_switch_put(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_put);
+
+void rpc_clnt_xprt_switch_add_xprt(struct rpc_clnt *clnt, struct rpc_xprt *xprt)
+{
+	rpc_xprt_switch_add_xprt(rcu_dereference(clnt->cl_xpi.xpi_xpswitch),
+				 xprt);
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_add_xprt);
+
+bool rpc_clnt_xprt_switch_has_addr(struct rpc_clnt *clnt,
+				   const struct sockaddr *sap)
+{
+	struct rpc_xprt_switch *xps;
+	bool ret;
+
+	xps = rcu_dereference(clnt->cl_xpi.xpi_xpswitch);
+
+	rcu_read_lock();
+	ret = rpc_xprt_switch_has_addr(xps, sap);
+	rcu_read_unlock();
+	return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_xprt_switch_has_addr);
+
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
 static void rpc_show_header(void)
 {
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 9ae588511aaf..5db68b371db2 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -849,14 +849,17 @@ static void rpc_async_schedule(struct work_struct *work)
 }
 
 /**
- * rpc_malloc - allocate an RPC buffer
- * @task: RPC task that will use this buffer
- * @size: requested byte size
+ * rpc_malloc - allocate RPC buffer resources
+ * @task: RPC task
+ *
+ * A single memory region is allocated, which is split between the
+ * RPC call and RPC reply that this task is being used for. When
+ * this RPC is retired, the memory is released by calling rpc_free.
  *
  * To prevent rpciod from hanging, this allocator never sleeps,
- * returning NULL and suppressing warning if the request cannot be serviced
- * immediately.
- * The caller can arrange to sleep in a way that is safe for rpciod.
+ * returning -ENOMEM and suppressing warning if the request cannot
+ * be serviced immediately. The caller can arrange to sleep in a
+ * way that is safe for rpciod.
  *
  * Most requests are 'small' (under 2KiB) and can be serviced from a
  * mempool, ensuring that NFS reads and writes can always proceed,
@@ -865,8 +868,10 @@ static void rpc_async_schedule(struct work_struct *work)
  * In order to avoid memory starvation triggering more writebacks of
  * NFS requests, we avoid using GFP_KERNEL.
  */
-void *rpc_malloc(struct rpc_task *task, size_t size)
+int rpc_malloc(struct rpc_task *task)
 {
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	size_t size = rqst->rq_callsize + rqst->rq_rcvsize;
 	struct rpc_buffer *buf;
 	gfp_t gfp = GFP_NOIO | __GFP_NOWARN;
 
@@ -880,28 +885,28 @@ void *rpc_malloc(struct rpc_task *task, size_t size)
 		buf = kmalloc(size, gfp);
 
 	if (!buf)
-		return NULL;
+		return -ENOMEM;
 
 	buf->len = size;
 	dprintk("RPC: %5u allocated buffer of size %zu at %p\n",
 			task->tk_pid, size, buf);
-	return &buf->data;
+	rqst->rq_buffer = buf->data;
+	rqst->rq_rbuffer = (char *)rqst->rq_buffer + rqst->rq_callsize;
+	return 0;
 }
 EXPORT_SYMBOL_GPL(rpc_malloc);
 
 /**
- * rpc_free - free buffer allocated via rpc_malloc
- * @buffer: buffer to free
+ * rpc_free - free RPC buffer resources allocated via rpc_malloc
+ * @task: RPC task
  *
  */
-void rpc_free(void *buffer)
+void rpc_free(struct rpc_task *task)
 {
+	void *buffer = task->tk_rqstp->rq_buffer;
 	size_t size;
 	struct rpc_buffer *buf;
 
-	if (!buffer)
-		return;
-
 	buf = container_of(buffer, struct rpc_buffer, data);
 	size = buf->len;
 
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index c5b0cb4f4056..7c8070ec93c8 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -401,6 +401,21 @@ int svc_bind(struct svc_serv *serv, struct net *net)
 }
 EXPORT_SYMBOL_GPL(svc_bind);
 
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+	INIT_LIST_HEAD(&serv->sv_cb_list);
+	spin_lock_init(&serv->sv_cb_lock);
+	init_waitqueue_head(&serv->sv_cb_waitq);
+}
+#else
+static void
+__svc_init_bc(struct svc_serv *serv)
+{
+}
+#endif
+
 /*
  * Create an RPC service
  */
@@ -443,6 +458,8 @@ __svc_create(struct svc_program *prog, unsigned int bufsize, int npools,
 	init_timer(&serv->sv_temptimer);
 	spin_lock_init(&serv->sv_lock);
 
+	__svc_init_bc(serv);
+
 	serv->sv_nrpools = npools;
 	serv->sv_pools =
 		kcalloc(serv->sv_nrpools, sizeof(struct svc_pool),
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index c4f3cc0c0775..7f1071e103ca 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -767,7 +767,7 @@ static void xdr_set_next_page(struct xdr_stream *xdr)
 	newbase -= xdr->buf->page_base;
 
 	if (xdr_set_page_base(xdr, newbase, PAGE_SIZE) < 0)
-		xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+		xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
 }
 
 static bool xdr_set_next_buffer(struct xdr_stream *xdr)
@@ -776,7 +776,7 @@ static bool xdr_set_next_buffer(struct xdr_stream *xdr)
 		xdr_set_next_page(xdr);
 	else if (xdr->iov == xdr->buf->head) {
 		if (xdr_set_page_base(xdr, 0, PAGE_SIZE) < 0)
-			xdr_set_iov(xdr, xdr->buf->tail, xdr->buf->len);
+			xdr_set_iov(xdr, xdr->buf->tail, xdr->nwords << 2);
 	}
 	return xdr->p != xdr->end;
 }
@@ -859,12 +859,15 @@ EXPORT_SYMBOL_GPL(xdr_set_scratch_buffer);
 static __be32 *xdr_copy_to_scratch(struct xdr_stream *xdr, size_t nbytes)
 {
 	__be32 *p;
-	void *cpdest = xdr->scratch.iov_base;
+	char *cpdest = xdr->scratch.iov_base;
 	size_t cplen = (char *)xdr->end - (char *)xdr->p;
 
 	if (nbytes > xdr->scratch.iov_len)
 		return NULL;
-	memcpy(cpdest, xdr->p, cplen);
+	p = __xdr_inline_decode(xdr, cplen);
+	if (p == NULL)
+		return NULL;
+	memcpy(cpdest, p, cplen);
 	cpdest += cplen;
 	nbytes -= cplen;
 	if (!xdr_set_next_buffer(xdr))
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index ea244b29138b..685e6d225414 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -1295,7 +1295,7 @@ void xprt_release(struct rpc_task *task)
 	xprt_schedule_autodisconnect(xprt);
 	spin_unlock_bh(&xprt->transport_lock);
 	if (req->rq_buffer)
-		xprt->ops->buf_free(req->rq_buffer);
+		xprt->ops->buf_free(task);
 	xprt_inject_disconnect(xprt);
 	if (req->rq_cred != NULL)
 		put_rpccred(req->rq_cred);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
index 66c9d63f4797..ae92a9e9ba52 100644
--- a/net/sunrpc/xprtmultipath.c
+++ b/net/sunrpc/xprtmultipath.c
@@ -15,6 +15,7 @@
 #include <asm/cmpxchg.h>
 #include <linux/spinlock.h>
 #include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/addr.h>
 #include <linux/sunrpc/xprtmultipath.h>
 
 typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
@@ -49,7 +50,8 @@ void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
 	if (xprt == NULL)
 		return;
 	spin_lock(&xps->xps_lock);
-	if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
+	if ((xps->xps_net == xprt->xprt_net || xps->xps_net == NULL) &&
+	    !rpc_xprt_switch_has_addr(xps, (struct sockaddr *)&xprt->addr))
 		xprt_switch_add_xprt_locked(xps, xprt);
 	spin_unlock(&xps->xps_lock);
 }
@@ -232,6 +234,26 @@ struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
 	return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
 }
 
+bool rpc_xprt_switch_has_addr(struct rpc_xprt_switch *xps,
+			      const struct sockaddr *sap)
+{
+	struct list_head *head;
+	struct rpc_xprt *pos;
+
+	if (xps == NULL || sap == NULL)
+		return false;
+
+	head = &xps->xps_xprt_list;
+	list_for_each_entry_rcu(pos, head, xprt_switch) {
+		if (rpc_cmp_addr_port(sap, (struct sockaddr *)&pos->addr)) {
+			pr_info("RPC:   addr %s already in xprt switch\n",
+				pos->address_strings[RPC_DISPLAY_ADDR]);
+			return true;
+		}
+	}
+	return false;
+}
+
 static
 struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
 		const struct rpc_xprt *cur)
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 87762d976b63..2c472e1b4827 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -27,7 +27,7 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
 	list_del(&req->rl_all);
 	spin_unlock(&buf->rb_reqslock);
 
-	rpcrdma_destroy_req(&r_xprt->rx_ia, req);
+	rpcrdma_destroy_req(req);
 
 	kfree(rqst);
 }
@@ -35,10 +35,8 @@ static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
 static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 				 struct rpc_rqst *rqst)
 {
-	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_regbuf *rb;
 	struct rpcrdma_req *req;
-	struct xdr_buf *buf;
 	size_t size;
 
 	req = rpcrdma_create_req(r_xprt);
@@ -46,30 +44,19 @@ static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
 		return PTR_ERR(req);
 	req->rl_backchannel = true;
 
-	size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
-	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+	rb = rpcrdma_alloc_regbuf(RPCRDMA_HDRBUF_SIZE,
+				  DMA_TO_DEVICE, GFP_KERNEL);
 	if (IS_ERR(rb))
 		goto out_fail;
 	req->rl_rdmabuf = rb;
 
-	size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
-	rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+	size = r_xprt->rx_data.inline_rsize;
+	rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, GFP_KERNEL);
 	if (IS_ERR(rb))
 		goto out_fail;
-	rb->rg_owner = req;
 	req->rl_sendbuf = rb;
-	/* so that rpcr_to_rdmar works when receiving a request */
-	rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
-
-	buf = &rqst->rq_snd_buf;
-	buf->head[0].iov_base = rqst->rq_buffer;
-	buf->head[0].iov_len = 0;
-	buf->tail[0].iov_base = NULL;
-	buf->tail[0].iov_len = 0;
-	buf->page_len = 0;
-	buf->len = 0;
-	buf->buflen = size;
-
+	xdr_buf_init(&rqst->rq_snd_buf, rb->rg_base, size);
+	rpcrdma_set_xprtdata(rqst, req);
 	return 0;
 
 out_fail:
@@ -219,7 +206,6 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
 	struct rpcrdma_msg *headerp;
-	size_t rpclen;
 
 	headerp = rdmab_to_msg(req->rl_rdmabuf);
 	headerp->rm_xid = rqst->rq_xid;
@@ -231,26 +217,9 @@ int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
 	headerp->rm_body.rm_chunks[1] = xdr_zero;
 	headerp->rm_body.rm_chunks[2] = xdr_zero;
 
-	rpclen = rqst->rq_svec[0].iov_len;
-
-#ifdef RPCRDMA_BACKCHANNEL_DEBUG
-	pr_info("RPC:       %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
-		__func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
-	pr_info("RPC:       %s: RPC/RDMA: %*ph\n",
-		__func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
-	pr_info("RPC:       %s:      RPC: %*ph\n",
-		__func__, (int)rpclen, rqst->rq_svec[0].iov_base);
-#endif
-
-	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
-	req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
-	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
-	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
-	req->rl_send_iov[1].length = rpclen;
-	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
-	req->rl_niovs = 2;
+	if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, RPCRDMA_HDRLEN_MIN,
+				       &rqst->rq_snd_buf, rpcrdma_noch))
+		return -EIO;
 	return 0;
 }
 
@@ -402,7 +371,7 @@ out_overflow:
 out_short:
 	pr_warn("RPC/RDMA short backward direction call\n");
 
-	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
 		xprt_disconnect_done(xprt);
 	else
 		pr_warn("RPC:       %s: reposting rep %p\n",
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index 21cb3b150b37..1ebb09e1ac4f 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -160,9 +160,8 @@ static int
 fmr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 	    struct rpcrdma_create_data_internal *cdata)
 {
-	rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
-						      RPCRDMA_MAX_DATA_SEGS /
-						      RPCRDMA_MAX_FMR_SGES));
+	ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+				RPCRDMA_MAX_FMR_SGES);
 	return 0;
 }
 
@@ -274,6 +273,7 @@ fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 */
 	list_for_each_entry(mw, &req->rl_registered, mw_list)
 		list_add_tail(&mw->fmr.fm_mr->list, &unmap_list);
+	r_xprt->rx_stats.local_inv_needed++;
 	rc = ib_unmap_fmr(&unmap_list);
 	if (rc)
 		goto out_reset;
@@ -331,4 +331,5 @@ const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
 	.ro_init_mr			= fmr_op_init_mr,
 	.ro_release_mr			= fmr_op_release_mr,
 	.ro_displayname			= "fmr",
+	.ro_send_w_inv_ok		= 0,
 };
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 892b5e1d9b09..210949562786 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -67,6 +67,8 @@
  * pending send queue WRs before the transport is reconnected.
  */
 
+#include <linux/sunrpc/rpc_rdma.h>
+
 #include "xprt_rdma.h"
 
 #if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -161,7 +163,7 @@ __frwr_reset_mr(struct rpcrdma_ia *ia, struct rpcrdma_mw *r)
 		return PTR_ERR(f->fr_mr);
 	}
 
-	dprintk("RPC:       %s: recovered FRMR %p\n", __func__, r);
+	dprintk("RPC:       %s: recovered FRMR %p\n", __func__, f);
 	f->fr_state = FRMR_IS_INVALID;
 	return 0;
 }
@@ -242,9 +244,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
 					       depth;
 	}
 
-	rpcrdma_set_max_header_sizes(ia, cdata, max_t(unsigned int, 1,
-						      RPCRDMA_MAX_DATA_SEGS /
-						      ia->ri_max_frmr_depth));
+	ia->ri_max_segs = max_t(unsigned int, 1, RPCRDMA_MAX_DATA_SEGS /
+				ia->ri_max_frmr_depth);
 	return 0;
 }
 
@@ -329,7 +330,7 @@ frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
 	frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
 	if (wc->status != IB_WC_SUCCESS)
 		__frwr_sendcompletion_flush(wc, frmr, "localinv");
-	complete_all(&frmr->fr_linv_done);
+	complete(&frmr->fr_linv_done);
 }
 
 /* Post a REG_MR Work Request to register a memory region
@@ -396,7 +397,7 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
 		goto out_mapmr_err;
 
 	dprintk("RPC:       %s: Using frmr %p to map %u segments (%u bytes)\n",
-		__func__, mw, mw->mw_nents, mr->length);
+		__func__, frmr, mw->mw_nents, mr->length);
 
 	key = (u8)(mr->rkey & 0x000000FF);
 	ib_update_fast_reg_key(mr, ++key);
@@ -449,6 +450,8 @@ __frwr_prepare_linv_wr(struct rpcrdma_mw *mw)
 	struct rpcrdma_frmr *f = &mw->frmr;
 	struct ib_send_wr *invalidate_wr;
 
+	dprintk("RPC:       %s: invalidating frmr %p\n", __func__, f);
+
 	f->fr_state = FRMR_IS_INVALID;
 	invalidate_wr = &f->fr_invwr;
 
@@ -472,6 +475,7 @@ static void
 frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 {
 	struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+	struct rpcrdma_rep *rep = req->rl_reply;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 	struct rpcrdma_mw *mw, *tmp;
 	struct rpcrdma_frmr *f;
@@ -487,6 +491,12 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	f = NULL;
 	invalidate_wrs = pos = prev = NULL;
 	list_for_each_entry(mw, &req->rl_registered, mw_list) {
+		if ((rep->rr_wc_flags & IB_WC_WITH_INVALIDATE) &&
+		    (mw->mw_handle == rep->rr_inv_rkey)) {
+			mw->frmr.fr_state = FRMR_IS_INVALID;
+			continue;
+		}
+
 		pos = __frwr_prepare_linv_wr(mw);
 
 		if (!invalidate_wrs)
@@ -496,6 +506,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 		prev = pos;
 		f = &mw->frmr;
 	}
+	if (!f)
+		goto unmap;
 
 	/* Strong send queue ordering guarantees that when the
 	 * last WR in the chain completes, all WRs in the chain
@@ -510,6 +522,7 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 * replaces the QP. The RPC reply handler won't call us
 	 * unless ri_id->qp is a valid pointer.
 	 */
+	r_xprt->rx_stats.local_inv_needed++;
 	rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
 	if (rc)
 		goto reset_mrs;
@@ -521,6 +534,8 @@ frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
 	 */
 unmap:
 	list_for_each_entry_safe(mw, tmp, &req->rl_registered, mw_list) {
+		dprintk("RPC:       %s: unmapping frmr %p\n",
+			__func__, &mw->frmr);
 		list_del_init(&mw->mw_list);
 		ib_dma_unmap_sg(ia->ri_device,
 				mw->mw_sg, mw->mw_nents, mw->mw_dir);
@@ -576,4 +591,5 @@ const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
 	.ro_init_mr			= frwr_op_init_mr,
 	.ro_release_mr			= frwr_op_release_mr,
 	.ro_displayname			= "frwr",
+	.ro_send_w_inv_ok		= RPCRDMA_CMP_F_SND_W_INV_OK,
 };
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index a47f170b20ef..d987c2d3dd6e 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -53,14 +53,6 @@
 # define RPCDBG_FACILITY	RPCDBG_TRANS
 #endif
 
-enum rpcrdma_chunktype {
-	rpcrdma_noch = 0,
-	rpcrdma_readch,
-	rpcrdma_areadch,
-	rpcrdma_writech,
-	rpcrdma_replych
-};
-
 static const char transfertypes[][12] = {
 	"inline",	/* no chunks */
 	"read list",	/* some argument via rdma read */
@@ -118,10 +110,12 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
 	return size;
 }
 
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *ia,
-				  struct rpcrdma_create_data_internal *cdata,
-				  unsigned int maxsegs)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
 {
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+	unsigned int maxsegs = ia->ri_max_segs;
+
 	ia->ri_max_inline_write = cdata->inline_wsize -
 				  rpcrdma_max_call_header_size(maxsegs);
 	ia->ri_max_inline_read = cdata->inline_rsize -
@@ -155,42 +149,6 @@ static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
 	return rqst->rq_rcv_buf.buflen <= ia->ri_max_inline_read;
 }
 
-static int
-rpcrdma_tail_pullup(struct xdr_buf *buf)
-{
-	size_t tlen = buf->tail[0].iov_len;
-	size_t skip = tlen & 3;
-
-	/* Do not include the tail if it is only an XDR pad */
-	if (tlen < 4)
-		return 0;
-
-	/* xdr_write_pages() adds a pad at the beginning of the tail
-	 * if the content in "buf->pages" is unaligned. Force the
-	 * tail's actual content to land at the next XDR position
-	 * after the head instead.
-	 */
-	if (skip) {
-		unsigned char *src, *dst;
-		unsigned int count;
-
-		src = buf->tail[0].iov_base;
-		dst = buf->head[0].iov_base;
-		dst += buf->head[0].iov_len;
-
-		src += skip;
-		tlen -= skip;
-
-		dprintk("RPC:       %s: skip=%zu, memmove(%p, %p, %zu)\n",
-			__func__, skip, dst, src, tlen);
-
-		for (count = tlen; count; count--)
-			*dst++ = *src++;
-	}
-
-	return tlen;
-}
-
 /* Split "vec" on page boundaries into segments. FMR registers pages,
  * not a byte range. Other modes coalesce these segments into a single
  * MR when they can.
@@ -229,7 +187,8 @@ rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg, int n)
 
 static int
 rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
-	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg)
+	enum rpcrdma_chunktype type, struct rpcrdma_mr_seg *seg,
+	bool reminv_expected)
 {
 	int len, n, p, page_base;
 	struct page **ppages;
@@ -271,6 +230,13 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
 	if (type == rpcrdma_readch)
 		return n;
 
+	/* When encoding the Write list, some servers need to see an extra
+	 * segment for odd-length Write chunks. The upper layer provides
+	 * space in the tail iovec for this purpose.
+	 */
+	if (type == rpcrdma_writech && reminv_expected)
+		return n;
+
 	if (xdrbuf->tail[0].iov_len) {
 		/* the rpcrdma protocol allows us to omit any trailing
 		 * xdr pad bytes, saving the server an RDMA operation. */
@@ -327,7 +293,7 @@ rpcrdma_encode_read_list(struct rpcrdma_xprt *r_xprt,
 	if (rtype == rpcrdma_areadch)
 		pos = 0;
 	seg = req->rl_segments;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg);
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_snd_buf, pos, rtype, seg, false);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -391,7 +357,8 @@ rpcrdma_encode_write_list(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
 	seg = req->rl_segments;
 	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf,
 				     rqst->rq_rcv_buf.head[0].iov_len,
-				     wtype, seg);
+				     wtype, seg,
+				     r_xprt->rx_ia.ri_reminv_expected);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -456,7 +423,8 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 	}
 
 	seg = req->rl_segments;
-	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg);
+	nsegs = rpcrdma_convert_iovs(&rqst->rq_rcv_buf, 0, wtype, seg,
+				     r_xprt->rx_ia.ri_reminv_expected);
 	if (nsegs < 0)
 		return ERR_PTR(nsegs);
 
@@ -491,74 +459,184 @@ rpcrdma_encode_reply_chunk(struct rpcrdma_xprt *r_xprt,
 	return iptr;
 }
 
-/*
- * Copy write data inline.
- * This function is used for "small" requests. Data which is passed
- * to RPC via iovecs (or page list) is copied directly into the
- * pre-registered memory buffer for this request. For small amounts
- * of data, this is efficient. The cutoff value is tunable.
+/* Prepare the RPC-over-RDMA header SGE.
  */
-static void rpcrdma_inline_pullup(struct rpc_rqst *rqst)
+static bool
+rpcrdma_prepare_hdr_sge(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+			u32 len)
 {
-	int i, npages, curlen;
-	int copy_len;
-	unsigned char *srcp, *destp;
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
-	int page_base;
-	struct page **ppages;
+	struct rpcrdma_regbuf *rb = req->rl_rdmabuf;
+	struct ib_sge *sge = &req->rl_send_sge[0];
+
+	if (unlikely(!rpcrdma_regbuf_is_mapped(rb))) {
+		if (!__rpcrdma_dma_map_regbuf(ia, rb))
+			return false;
+		sge->addr = rdmab_addr(rb);
+		sge->lkey = rdmab_lkey(rb);
+	}
+	sge->length = len;
+
+	ib_dma_sync_single_for_device(ia->ri_device, sge->addr,
+				      sge->length, DMA_TO_DEVICE);
+	req->rl_send_wr.num_sge++;
+	return true;
+}
 
-	destp = rqst->rq_svec[0].iov_base;
-	curlen = rqst->rq_svec[0].iov_len;
-	destp += curlen;
+/* Prepare the Send SGEs. The head and tail iovec, and each entry
+ * in the page list, gets its own SGE.
+ */
+static bool
+rpcrdma_prepare_msg_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+			 struct xdr_buf *xdr, enum rpcrdma_chunktype rtype)
+{
+	unsigned int sge_no, page_base, len, remaining;
+	struct rpcrdma_regbuf *rb = req->rl_sendbuf;
+	struct ib_device *device = ia->ri_device;
+	struct ib_sge *sge = req->rl_send_sge;
+	u32 lkey = ia->ri_pd->local_dma_lkey;
+	struct page *page, **ppages;
+
+	/* The head iovec is straightforward, as it is already
+	 * DMA-mapped. Sync the content that has changed.
+	 */
+	if (!rpcrdma_dma_map_regbuf(ia, rb))
+		return false;
+	sge_no = 1;
+	sge[sge_no].addr = rdmab_addr(rb);
+	sge[sge_no].length = xdr->head[0].iov_len;
+	sge[sge_no].lkey = rdmab_lkey(rb);
+	ib_dma_sync_single_for_device(device, sge[sge_no].addr,
+				      sge[sge_no].length, DMA_TO_DEVICE);
+
+	/* If there is a Read chunk, the page list is being handled
+	 * via explicit RDMA, and thus is skipped here. However, the
+	 * tail iovec may include an XDR pad for the page list, as
+	 * well as additional content, and may not reside in the
+	 * same page as the head iovec.
+	 */
+	if (rtype == rpcrdma_readch) {
+		len = xdr->tail[0].iov_len;
 
-	dprintk("RPC:       %s: destp 0x%p len %d hdrlen %d\n",
-		__func__, destp, rqst->rq_slen, curlen);
+		/* Do not include the tail if it is only an XDR pad */
+		if (len < 4)
+			goto out;
 
-	copy_len = rqst->rq_snd_buf.page_len;
+		page = virt_to_page(xdr->tail[0].iov_base);
+		page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
 
-	if (rqst->rq_snd_buf.tail[0].iov_len) {
-		curlen = rqst->rq_snd_buf.tail[0].iov_len;
-		if (destp + copy_len != rqst->rq_snd_buf.tail[0].iov_base) {
-			memmove(destp + copy_len,
-				rqst->rq_snd_buf.tail[0].iov_base, curlen);
-			r_xprt->rx_stats.pullup_copy_count += curlen;
+		/* If the content in the page list is an odd length,
+		 * xdr_write_pages() has added a pad at the beginning
+		 * of the tail iovec. Force the tail's non-pad content
+		 * to land at the next XDR position in the Send message.
+		 */
+		page_base += len & 3;
+		len -= len & 3;
+		goto map_tail;
+	}
+
+	/* If there is a page list present, temporarily DMA map
+	 * and prepare an SGE for each page to be sent.
+	 */
+	if (xdr->page_len) {
+		ppages = xdr->pages + (xdr->page_base >> PAGE_SHIFT);
+		page_base = xdr->page_base & ~PAGE_MASK;
+		remaining = xdr->page_len;
+		while (remaining) {
+			sge_no++;
+			if (sge_no > RPCRDMA_MAX_SEND_SGES - 2)
+				goto out_mapping_overflow;
+
+			len = min_t(u32, PAGE_SIZE - page_base, remaining);
+			sge[sge_no].addr = ib_dma_map_page(device, *ppages,
+							   page_base, len,
+							   DMA_TO_DEVICE);
+			if (ib_dma_mapping_error(device, sge[sge_no].addr))
+				goto out_mapping_err;
+			sge[sge_no].length = len;
+			sge[sge_no].lkey = lkey;
+
+			req->rl_mapped_sges++;
+			ppages++;
+			remaining -= len;
+			page_base = 0;
 		}
-		dprintk("RPC:       %s: tail destp 0x%p len %d\n",
-			__func__, destp + copy_len, curlen);
-		rqst->rq_svec[0].iov_len += curlen;
 	}
-	r_xprt->rx_stats.pullup_copy_count += copy_len;
 
-	page_base = rqst->rq_snd_buf.page_base;
-	ppages = rqst->rq_snd_buf.pages + (page_base >> PAGE_SHIFT);
-	page_base &= ~PAGE_MASK;
-	npages = PAGE_ALIGN(page_base+copy_len) >> PAGE_SHIFT;
-	for (i = 0; copy_len && i < npages; i++) {
-		curlen = PAGE_SIZE - page_base;
-		if (curlen > copy_len)
-			curlen = copy_len;
-		dprintk("RPC:       %s: page %d destp 0x%p len %d curlen %d\n",
-			__func__, i, destp, copy_len, curlen);
-		srcp = kmap_atomic(ppages[i]);
-		memcpy(destp, srcp+page_base, curlen);
-		kunmap_atomic(srcp);
-		rqst->rq_svec[0].iov_len += curlen;
-		destp += curlen;
-		copy_len -= curlen;
-		page_base = 0;
+	/* The tail iovec is not always constructed in the same
+	 * page where the head iovec resides (see, for example,
+	 * gss_wrap_req_priv). To neatly accommodate that case,
+	 * DMA map it separately.
+	 */
+	if (xdr->tail[0].iov_len) {
+		page = virt_to_page(xdr->tail[0].iov_base);
+		page_base = (unsigned long)xdr->tail[0].iov_base & ~PAGE_MASK;
+		len = xdr->tail[0].iov_len;
+
+map_tail:
+		sge_no++;
+		sge[sge_no].addr = ib_dma_map_page(device, page,
+						   page_base, len,
+						   DMA_TO_DEVICE);
+		if (ib_dma_mapping_error(device, sge[sge_no].addr))
+			goto out_mapping_err;
+		sge[sge_no].length = len;
+		sge[sge_no].lkey = lkey;
+		req->rl_mapped_sges++;
 	}
-	/* header now contains entire send message */
+
+out:
+	req->rl_send_wr.num_sge = sge_no + 1;
+	return true;
+
+out_mapping_overflow:
+	pr_err("rpcrdma: too many Send SGEs (%u)\n", sge_no);
+	return false;
+
+out_mapping_err:
+	pr_err("rpcrdma: Send mapping error\n");
+	return false;
+}
+
+bool
+rpcrdma_prepare_send_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req,
+			  u32 hdrlen, struct xdr_buf *xdr,
+			  enum rpcrdma_chunktype rtype)
+{
+	req->rl_send_wr.num_sge = 0;
+	req->rl_mapped_sges = 0;
+
+	if (!rpcrdma_prepare_hdr_sge(ia, req, hdrlen))
+		goto out_map;
+
+	if (rtype != rpcrdma_areadch)
+		if (!rpcrdma_prepare_msg_sges(ia, req, xdr, rtype))
+			goto out_map;
+
+	return true;
+
+out_map:
+	pr_err("rpcrdma: failed to DMA map a Send buffer\n");
+	return false;
+}
+
+void
+rpcrdma_unmap_sges(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+{
+	struct ib_device *device = ia->ri_device;
+	struct ib_sge *sge;
+	int count;
+
+	sge = &req->rl_send_sge[2];
+	for (count = req->rl_mapped_sges; count--; sge++)
+		ib_dma_unmap_page(device, sge->addr, sge->length,
+				  DMA_TO_DEVICE);
+	req->rl_mapped_sges = 0;
 }
 
 /*
  * Marshal a request: the primary job of this routine is to choose
  * the transfer modes. See comments below.
  *
- * Prepares up to two IOVs per Call message:
- *
- *  [0] -- RPC RDMA header
- *  [1] -- the RPC header/data
- *
  * Returns zero on success, otherwise a negative errno.
  */
 
@@ -626,12 +704,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 	 */
 	if (rpcrdma_args_inline(r_xprt, rqst)) {
 		rtype = rpcrdma_noch;
-		rpcrdma_inline_pullup(rqst);
-		rpclen = rqst->rq_svec[0].iov_len;
+		rpclen = rqst->rq_snd_buf.len;
 	} else if (ddp_allowed && rqst->rq_snd_buf.flags & XDRBUF_WRITE) {
 		rtype = rpcrdma_readch;
-		rpclen = rqst->rq_svec[0].iov_len;
-		rpclen += rpcrdma_tail_pullup(&rqst->rq_snd_buf);
+		rpclen = rqst->rq_snd_buf.head[0].iov_len +
+			 rqst->rq_snd_buf.tail[0].iov_len;
 	} else {
 		r_xprt->rx_stats.nomsg_call_count++;
 		headerp->rm_type = htonl(RDMA_NOMSG);
@@ -673,34 +750,18 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
 		goto out_unmap;
 	hdrlen = (unsigned char *)iptr - (unsigned char *)headerp;
 
-	if (hdrlen + rpclen > RPCRDMA_INLINE_WRITE_THRESHOLD(rqst))
-		goto out_overflow;
-
 	dprintk("RPC: %5u %s: %s/%s: hdrlen %zd rpclen %zd\n",
 		rqst->rq_task->tk_pid, __func__,
 		transfertypes[rtype], transfertypes[wtype],
 		hdrlen, rpclen);
 
-	req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
-	req->rl_send_iov[0].length = hdrlen;
-	req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
-
-	req->rl_niovs = 1;
-	if (rtype == rpcrdma_areadch)
-		return 0;
-
-	req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
-	req->rl_send_iov[1].length = rpclen;
-	req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
-
-	req->rl_niovs = 2;
+	if (!rpcrdma_prepare_send_sges(&r_xprt->rx_ia, req, hdrlen,
+				       &rqst->rq_snd_buf, rtype)) {
+		iptr = ERR_PTR(-EIO);
+		goto out_unmap;
+	}
 	return 0;
 
-out_overflow:
-	pr_err("rpcrdma: send overflow: hdrlen %zd rpclen %zu %s/%s\n",
-		hdrlen, rpclen, transfertypes[rtype], transfertypes[wtype]);
-	iptr = ERR_PTR(-EIO);
-
 out_unmap:
 	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req, false);
 	return PTR_ERR(iptr);
@@ -916,8 +977,10 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
  * allowed to timeout, to discover the errors at that time.
  */
 void
-rpcrdma_reply_handler(struct rpcrdma_rep *rep)
+rpcrdma_reply_handler(struct work_struct *work)
 {
+	struct rpcrdma_rep *rep =
+			container_of(work, struct rpcrdma_rep, rr_work);
 	struct rpcrdma_msg *headerp;
 	struct rpcrdma_req *req;
 	struct rpc_rqst *rqst;
@@ -1132,6 +1195,6 @@ out_duplicate:
 
 repost:
 	r_xprt->rx_stats.bad_reply_count++;
-	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+	if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, rep))
 		rpcrdma_recv_buffer_put(rep);
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
index a2a7519b0f23..2d8545c34095 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -129,7 +129,7 @@ static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
 		ret = -EIO;
 		goto out_unmap;
 	}
-	atomic_inc(&rdma->sc_dma_used);
+	svc_rdma_count_mappings(rdma, ctxt);
 
 	memset(&send_wr, 0, sizeof(send_wr));
 	ctxt->cqe.done = svc_rdma_wc_send;
@@ -159,33 +159,34 @@ out_unmap:
 /* Server-side transport endpoint wants a whole page for its send
  * buffer. The client RPC code constructs the RPC header in this
  * buffer before it invokes ->send_request.
- *
- * Returns NULL if there was a temporary allocation failure.
  */
-static void *
-xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+static int
+xprt_rdma_bc_allocate(struct rpc_task *task)
 {
 	struct rpc_rqst *rqst = task->tk_rqstp;
 	struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+	size_t size = rqst->rq_callsize;
 	struct svcxprt_rdma *rdma;
 	struct page *page;
 
 	rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
 
-	/* Prevent an infinite loop: try to make this case work */
-	if (size > PAGE_SIZE)
+	if (size > PAGE_SIZE) {
 		WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
 			  size);
+		return -EINVAL;
+	}
 
 	page = alloc_page(RPCRDMA_DEF_GFP);
 	if (!page)
-		return NULL;
+		return -ENOMEM;
 
-	return page_address(page);
+	rqst->rq_buffer = page_address(page);
+	return 0;
 }
 
 static void
-xprt_rdma_bc_free(void *buffer)
+xprt_rdma_bc_free(struct rpc_task *task)
 {
 	/* No-op: ctxt and page have already been freed. */
 }
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index 2c25606f2561..ad1df979b3f0 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -159,7 +159,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
 					   ctxt->sge[pno].addr);
 		if (ret)
 			goto err;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 
 		ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
 		ctxt->sge[pno].length = len;
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 54d533300620..f5a91edcd233 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -225,6 +225,48 @@ svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
 	return rp_ary;
 }
 
+/* RPC-over-RDMA Version One private extension: Remote Invalidation.
+ * Responder's choice: requester signals it can handle Send With
+ * Invalidate, and responder chooses one rkey to invalidate.
+ *
+ * Find a candidate rkey to invalidate when sending a reply.  Picks the
+ * first rkey it finds in the chunks lists.
+ *
+ * Returns zero if RPC's chunk lists are empty.
+ */
+static u32 svc_rdma_get_inv_rkey(struct rpcrdma_msg *rdma_argp,
+				 struct rpcrdma_write_array *wr_ary,
+				 struct rpcrdma_write_array *rp_ary)
+{
+	struct rpcrdma_read_chunk *rd_ary;
+	struct rpcrdma_segment *arg_ch;
+	u32 inv_rkey;
+
+	inv_rkey = 0;
+
+	rd_ary = svc_rdma_get_read_chunk(rdma_argp);
+	if (rd_ary) {
+		inv_rkey = be32_to_cpu(rd_ary->rc_target.rs_handle);
+		goto out;
+	}
+
+	if (wr_ary && be32_to_cpu(wr_ary->wc_nchunks)) {
+		arg_ch = &wr_ary->wc_array[0].wc_target;
+		inv_rkey = be32_to_cpu(arg_ch->rs_handle);
+		goto out;
+	}
+
+	if (rp_ary && be32_to_cpu(rp_ary->wc_nchunks)) {
+		arg_ch = &rp_ary->wc_array[0].wc_target;
+		inv_rkey = be32_to_cpu(arg_ch->rs_handle);
+		goto out;
+	}
+
+out:
+	dprintk("svcrdma: Send With Invalidate rkey=%08x\n", inv_rkey);
+	return inv_rkey;
+}
+
 /* Assumptions:
  * - The specified write_len can be represented in sc_max_sge * PAGE_SIZE
  */
@@ -280,7 +322,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device,
 					 sge[sge_no].addr))
 			goto err;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 		sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
 		ctxt->count++;
 		sge_off = 0;
@@ -464,7 +506,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		      struct page *page,
 		      struct rpcrdma_msg *rdma_resp,
 		      struct svc_rdma_req_map *vec,
-		      int byte_count)
+		      int byte_count,
+		      u32 inv_rkey)
 {
 	struct svc_rdma_op_ctxt *ctxt;
 	struct ib_send_wr send_wr;
@@ -489,7 +532,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
 			    ctxt->sge[0].length, DMA_TO_DEVICE);
 	if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr))
 		goto err;
-	atomic_inc(&rdma->sc_dma_used);
+	svc_rdma_count_mappings(rdma, ctxt);
 
 	ctxt->direction = DMA_TO_DEVICE;
 
@@ -505,7 +548,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		if (ib_dma_mapping_error(rdma->sc_cm_id->device,
 					 ctxt->sge[sge_no].addr))
 			goto err;
-		atomic_inc(&rdma->sc_dma_used);
+		svc_rdma_count_mappings(rdma, ctxt);
 		ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
 		ctxt->sge[sge_no].length = sge_bytes;
 	}
@@ -523,23 +566,9 @@ static int send_reply(struct svcxprt_rdma *rdma,
 		ctxt->pages[page_no+1] = rqstp->rq_respages[page_no];
 		ctxt->count++;
 		rqstp->rq_respages[page_no] = NULL;
-		/*
-		 * If there are more pages than SGE, terminate SGE
-		 * list so that svc_rdma_unmap_dma doesn't attempt to
-		 * unmap garbage.
-		 */
-		if (page_no+1 >= sge_no)
-			ctxt->sge[page_no+1].length = 0;
 	}
 	rqstp->rq_next_page = rqstp->rq_respages + 1;
 
-	/* The loop above bumps sc_dma_used for each sge. The
-	 * xdr_buf.tail gets a separate sge, but resides in the
-	 * same page as xdr_buf.head. Don't count it twice.
-	 */
-	if (sge_no > ctxt->count)
-		atomic_dec(&rdma->sc_dma_used);
-
 	if (sge_no > rdma->sc_max_sge) {
 		pr_err("svcrdma: Too many sges (%d)\n", sge_no);
 		goto err;
@@ -549,7 +578,11 @@ static int send_reply(struct svcxprt_rdma *rdma,
 	send_wr.wr_cqe = &ctxt->cqe;
 	send_wr.sg_list = ctxt->sge;
 	send_wr.num_sge = sge_no;
-	send_wr.opcode = IB_WR_SEND;
+	if (inv_rkey) {
+		send_wr.opcode = IB_WR_SEND_WITH_INV;
+		send_wr.ex.invalidate_rkey = inv_rkey;
+	} else
+		send_wr.opcode = IB_WR_SEND;
 	send_wr.send_flags =  IB_SEND_SIGNALED;
 
 	ret = svc_rdma_send(rdma, &send_wr);
@@ -581,6 +614,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	int inline_bytes;
 	struct page *res_page;
 	struct svc_rdma_req_map *vec;
+	u32 inv_rkey;
 
 	dprintk("svcrdma: sending response for rqstp=%p\n", rqstp);
 
@@ -591,6 +625,10 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 	wr_ary = svc_rdma_get_write_array(rdma_argp);
 	rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
 
+	inv_rkey = 0;
+	if (rdma->sc_snd_w_inv)
+		inv_rkey = svc_rdma_get_inv_rkey(rdma_argp, wr_ary, rp_ary);
+
 	/* Build an req vec for the XDR */
 	vec = svc_rdma_get_req_map(rdma);
 	ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
@@ -633,9 +671,9 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
 		goto err1;
 
 	ret = send_reply(rdma, rqstp, res_page, rdma_resp, vec,
-			 inline_bytes);
+			 inline_bytes, inv_rkey);
 	if (ret < 0)
-		goto err1;
+		goto err0;
 
 	svc_rdma_put_req_map(rdma, vec);
 	dprintk("svcrdma: send_reply returns %d\n", ret);
@@ -692,7 +730,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
 		svc_rdma_put_context(ctxt, 1);
 		return;
 	}
-	atomic_inc(&xprt->sc_dma_used);
+	svc_rdma_count_mappings(xprt, ctxt);
 
 	/* Prepare SEND WR */
 	memset(&err_wr, 0, sizeof(err_wr));
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index eb2857f52b05..6864fb967038 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -198,6 +198,7 @@ struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
 
 out:
 	ctxt->count = 0;
+	ctxt->mapped_sges = 0;
 	ctxt->frmr = NULL;
 	return ctxt;
 
@@ -221,22 +222,27 @@ out_empty:
 void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
 {
 	struct svcxprt_rdma *xprt = ctxt->xprt;
-	int i;
-	for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
+	struct ib_device *device = xprt->sc_cm_id->device;
+	u32 lkey = xprt->sc_pd->local_dma_lkey;
+	unsigned int i, count;
+
+	for (count = 0, i = 0; i < ctxt->mapped_sges; i++) {
 		/*
 		 * Unmap the DMA addr in the SGE if the lkey matches
 		 * the local_dma_lkey, otherwise, ignore it since it is
 		 * an FRMR lkey and will be unmapped later when the
 		 * last WR that uses it completes.
 		 */
-		if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
-			atomic_dec(&xprt->sc_dma_used);
-			ib_dma_unmap_page(xprt->sc_cm_id->device,
+		if (ctxt->sge[i].lkey == lkey) {
+			count++;
+			ib_dma_unmap_page(device,
 					    ctxt->sge[i].addr,
 					    ctxt->sge[i].length,
 					    ctxt->direction);
 		}
 	}
+	ctxt->mapped_sges = 0;
+	atomic_sub(count, &xprt->sc_dma_used);
 }
 
 void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
@@ -600,7 +606,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
 				     DMA_FROM_DEVICE);
 		if (ib_dma_mapping_error(xprt->sc_cm_id->device, pa))
 			goto err_put_ctxt;
-		atomic_inc(&xprt->sc_dma_used);
+		svc_rdma_count_mappings(xprt, ctxt);
 		ctxt->sge[sge_no].addr = pa;
 		ctxt->sge[sge_no].length = PAGE_SIZE;
 		ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
@@ -642,6 +648,26 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
 	return ret;
 }
 
+static void
+svc_rdma_parse_connect_private(struct svcxprt_rdma *newxprt,
+			       struct rdma_conn_param *param)
+{
+	const struct rpcrdma_connect_private *pmsg = param->private_data;
+
+	if (pmsg &&
+	    pmsg->cp_magic == rpcrdma_cmp_magic &&
+	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+		newxprt->sc_snd_w_inv = pmsg->cp_flags &
+					RPCRDMA_CMP_F_SND_W_INV_OK;
+
+		dprintk("svcrdma: client send_size %u, recv_size %u "
+			"remote inv %ssupported\n",
+			rpcrdma_decode_buffer_size(pmsg->cp_send_size),
+			rpcrdma_decode_buffer_size(pmsg->cp_recv_size),
+			newxprt->sc_snd_w_inv ? "" : "un");
+	}
+}
+
 /*
  * This function handles the CONNECT_REQUEST event on a listening
  * endpoint. It is passed the cma_id for the _new_ connection. The context in
@@ -653,7 +679,8 @@ int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
  * will call the recvfrom method on the listen xprt which will accept the new
  * connection.
  */
-static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
+static void handle_connect_req(struct rdma_cm_id *new_cma_id,
+			       struct rdma_conn_param *param)
 {
 	struct svcxprt_rdma *listen_xprt = new_cma_id->context;
 	struct svcxprt_rdma *newxprt;
@@ -669,9 +696,10 @@ static void handle_connect_req(struct rdma_cm_id *new_cma_id, size_t client_ird)
 	new_cma_id->context = newxprt;
 	dprintk("svcrdma: Creating newxprt=%p, cm_id=%p, listenxprt=%p\n",
 		newxprt, newxprt->sc_cm_id, listen_xprt);
+	svc_rdma_parse_connect_private(newxprt, param);
 
 	/* Save client advertised inbound read limit for use later in accept. */
-	newxprt->sc_ord = client_ird;
+	newxprt->sc_ord = param->initiator_depth;
 
 	/* Set the local and remote addresses in the transport */
 	sa = (struct sockaddr *)&newxprt->sc_cm_id->route.addr.dst_addr;
@@ -706,8 +734,7 @@ static int rdma_listen_handler(struct rdma_cm_id *cma_id,
 		dprintk("svcrdma: Connect request on cma_id=%p, xprt = %p, "
 			"event = %s (%d)\n", cma_id, cma_id->context,
 			rdma_event_msg(event->event), event->event);
-		handle_connect_req(cma_id,
-				   event->param.conn.initiator_depth);
+		handle_connect_req(cma_id, &event->param.conn);
 		break;
 
 	case RDMA_CM_EVENT_ESTABLISHED:
@@ -941,6 +968,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	struct svcxprt_rdma *listen_rdma;
 	struct svcxprt_rdma *newxprt = NULL;
 	struct rdma_conn_param conn_param;
+	struct rpcrdma_connect_private pmsg;
 	struct ib_qp_init_attr qp_attr;
 	struct ib_device *dev;
 	unsigned int i;
@@ -1070,7 +1098,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 			dev->attrs.max_fast_reg_page_list_len;
 		newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
 		newxprt->sc_reader = rdma_read_chunk_frmr;
-	}
+	} else
+		newxprt->sc_snd_w_inv = false;
 
 	/*
 	 * Determine if a DMA MR is required and if so, what privs are required
@@ -1094,11 +1123,20 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
 	/* Swap out the handler */
 	newxprt->sc_cm_id->event_handler = rdma_cma_handler;
 
+	/* Construct RDMA-CM private message */
+	pmsg.cp_magic = rpcrdma_cmp_magic;
+	pmsg.cp_version = RPCRDMA_CMP_VERSION;
+	pmsg.cp_flags = 0;
+	pmsg.cp_send_size = pmsg.cp_recv_size =
+		rpcrdma_encode_buffer_size(newxprt->sc_max_req_size);
+
 	/* Accept Connection */
 	set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
 	memset(&conn_param, 0, sizeof conn_param);
 	conn_param.responder_resources = 0;
 	conn_param.initiator_depth = newxprt->sc_ord;
+	conn_param.private_data = &pmsg;
+	conn_param.private_data_len = sizeof(pmsg);
 	ret = rdma_accept(newxprt->sc_cm_id, &conn_param);
 	if (ret) {
 		dprintk("svcrdma: failed to accept new connection, ret=%d\n",
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 81f0e879f019..ed5e285fd2ea 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -97,7 +97,7 @@ static struct ctl_table xr_tunables_table[] = {
 		.data		= &xprt_rdma_max_inline_read,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_inline_size,
 		.extra2		= &max_inline_size,
 	},
@@ -106,7 +106,7 @@ static struct ctl_table xr_tunables_table[] = {
 		.data		= &xprt_rdma_max_inline_write,
 		.maxlen		= sizeof(unsigned int),
 		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
+		.proc_handler	= proc_dointvec_minmax,
 		.extra1		= &min_inline_size,
 		.extra2		= &max_inline_size,
 	},
@@ -477,115 +477,152 @@ xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
 	}
 }
 
-/*
- * The RDMA allocate/free functions need the task structure as a place
- * to hide the struct rpcrdma_req, which is necessary for the actual send/recv
- * sequence.
+/* Allocate a fixed-size buffer in which to construct and send the
+ * RPC-over-RDMA header for this request.
+ */
+static bool
+rpcrdma_get_rdmabuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    gfp_t flags)
+{
+	size_t size = RPCRDMA_HDRBUF_SIZE;
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_rdmabuf)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_rdmabuf = rb;
+	return true;
+}
+
+static bool
+rpcrdma_get_sendbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
+{
+	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_sendbuf && rdmab_length(req->rl_sendbuf) >= size)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(size, DMA_TO_DEVICE, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(req->rl_sendbuf);
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_sendbuf = rb;
+	return true;
+}
+
+/* The rq_rcv_buf is used only if a Reply chunk is necessary.
+ * The decision to use a Reply chunk is made later in
+ * rpcrdma_marshal_req. This buffer is registered at that time.
  *
- * The RPC layer allocates both send and receive buffers in the same call
- * (rq_send_buf and rq_rcv_buf are both part of a single contiguous buffer).
- * We may register rq_rcv_buf when using reply chunks.
+ * Otherwise, the associated RPC Reply arrives in a separate
+ * Receive buffer, arbitrarily chosen by the HCA. The buffer
+ * allocated here for the RPC Reply is not utilized in that
+ * case. See rpcrdma_inline_fixup.
+ *
+ * A regbuf is used here to remember the buffer size.
  */
-static void *
-xprt_rdma_allocate(struct rpc_task *task, size_t size)
+static bool
+rpcrdma_get_recvbuf(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req,
+		    size_t size, gfp_t flags)
 {
-	struct rpc_xprt *xprt = task->tk_rqstp->rq_xprt;
-	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
 	struct rpcrdma_regbuf *rb;
+
+	if (req->rl_recvbuf && rdmab_length(req->rl_recvbuf) >= size)
+		return true;
+
+	rb = rpcrdma_alloc_regbuf(size, DMA_NONE, flags);
+	if (IS_ERR(rb))
+		return false;
+
+	rpcrdma_free_regbuf(req->rl_recvbuf);
+	r_xprt->rx_stats.hardway_register_count += size;
+	req->rl_recvbuf = rb;
+	return true;
+}
+
+/**
+ * xprt_rdma_allocate - allocate transport resources for an RPC
+ * @task: RPC task
+ *
+ * Return values:
+ *        0:	Success; rq_buffer points to RPC buffer to use
+ *   ENOMEM:	Out of memory, call again later
+ *      EIO:	A permanent error occurred, do not retry
+ *
+ * The RDMA allocate/free functions need the task structure as a place
+ * to hide the struct rpcrdma_req, which is necessary for the actual
+ * send/recv sequence.
+ *
+ * xprt_rdma_allocate provides buffers that are already mapped for
+ * DMA, and a local DMA lkey is provided for each.
+ */
+static int
+xprt_rdma_allocate(struct rpc_task *task)
+{
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
 	struct rpcrdma_req *req;
-	size_t min_size;
 	gfp_t flags;
 
 	req = rpcrdma_buffer_get(&r_xprt->rx_buf);
 	if (req == NULL)
-		return NULL;
+		return -ENOMEM;
 
 	flags = RPCRDMA_DEF_GFP;
 	if (RPC_IS_SWAPPER(task))
 		flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
 
-	if (req->rl_rdmabuf == NULL)
-		goto out_rdmabuf;
-	if (req->rl_sendbuf == NULL)
-		goto out_sendbuf;
-	if (size > req->rl_sendbuf->rg_size)
-		goto out_sendbuf;
-
-out:
-	dprintk("RPC:       %s: size %zd, request 0x%p\n", __func__, size, req);
-	req->rl_connect_cookie = 0;	/* our reserved value */
-	req->rl_task = task;
-	return req->rl_sendbuf->rg_base;
-
-out_rdmabuf:
-	min_size = RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, min_size, flags);
-	if (IS_ERR(rb))
+	if (!rpcrdma_get_rdmabuf(r_xprt, req, flags))
 		goto out_fail;
-	req->rl_rdmabuf = rb;
-
-out_sendbuf:
-	/* XDR encoding and RPC/RDMA marshaling of this request has not
-	 * yet occurred. Thus a lower bound is needed to prevent buffer
-	 * overrun during marshaling.
-	 *
-	 * RPC/RDMA marshaling may choose to send payload bearing ops
-	 * inline, if the result is smaller than the inline threshold.
-	 * The value of the "size" argument accounts for header
-	 * requirements but not for the payload in these cases.
-	 *
-	 * Likewise, allocate enough space to receive a reply up to the
-	 * size of the inline threshold.
-	 *
-	 * It's unlikely that both the send header and the received
-	 * reply will be large, but slush is provided here to allow
-	 * flexibility when marshaling.
-	 */
-	min_size = RPCRDMA_INLINE_READ_THRESHOLD(task->tk_rqstp);
-	min_size += RPCRDMA_INLINE_WRITE_THRESHOLD(task->tk_rqstp);
-	if (size < min_size)
-		size = min_size;
-
-	rb = rpcrdma_alloc_regbuf(&r_xprt->rx_ia, size, flags);
-	if (IS_ERR(rb))
+	if (!rpcrdma_get_sendbuf(r_xprt, req, rqst->rq_callsize, flags))
+		goto out_fail;
+	if (!rpcrdma_get_recvbuf(r_xprt, req, rqst->rq_rcvsize, flags))
 		goto out_fail;
-	rb->rg_owner = req;
 
-	r_xprt->rx_stats.hardway_register_count += size;
-	rpcrdma_free_regbuf(&r_xprt->rx_ia, req->rl_sendbuf);
-	req->rl_sendbuf = rb;
-	goto out;
+	dprintk("RPC: %5u %s: send size = %zd, recv size = %zd, req = %p\n",
+		task->tk_pid, __func__, rqst->rq_callsize,
+		rqst->rq_rcvsize, req);
+
+	req->rl_connect_cookie = 0;	/* our reserved value */
+	rpcrdma_set_xprtdata(rqst, req);
+	rqst->rq_buffer = req->rl_sendbuf->rg_base;
+	rqst->rq_rbuffer = req->rl_recvbuf->rg_base;
+	return 0;
 
 out_fail:
 	rpcrdma_buffer_put(req);
-	return NULL;
+	return -ENOMEM;
 }
 
-/*
- * This function returns all RDMA resources to the pool.
+/**
+ * xprt_rdma_free - release resources allocated by xprt_rdma_allocate
+ * @task: RPC task
+ *
+ * Caller guarantees rqst->rq_buffer is non-NULL.
  */
 static void
-xprt_rdma_free(void *buffer)
+xprt_rdma_free(struct rpc_task *task)
 {
-	struct rpcrdma_req *req;
-	struct rpcrdma_xprt *r_xprt;
-	struct rpcrdma_regbuf *rb;
-
-	if (buffer == NULL)
-		return;
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(rqst->rq_xprt);
+	struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
 
-	rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
-	req = rb->rg_owner;
 	if (req->rl_backchannel)
 		return;
 
-	r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
-
 	dprintk("RPC:       %s: called on 0x%p\n", __func__, req->rl_reply);
 
-	r_xprt->rx_ia.ri_ops->ro_unmap_safe(r_xprt, req,
-					    !RPC_IS_ASYNC(req->rl_task));
-
+	ia->ri_ops->ro_unmap_safe(r_xprt, req, !RPC_IS_ASYNC(task));
+	rpcrdma_unmap_sges(ia, req);
 	rpcrdma_buffer_put(req);
 }
 
@@ -685,10 +722,11 @@ void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
 		   r_xprt->rx_stats.failed_marshal_count,
 		   r_xprt->rx_stats.bad_reply_count,
 		   r_xprt->rx_stats.nomsg_call_count);
-	seq_printf(seq, "%lu %lu %lu\n",
+	seq_printf(seq, "%lu %lu %lu %lu\n",
 		   r_xprt->rx_stats.mrs_recovered,
 		   r_xprt->rx_stats.mrs_orphaned,
-		   r_xprt->rx_stats.mrs_allocated);
+		   r_xprt->rx_stats.mrs_allocated,
+		   r_xprt->rx_stats.local_inv_needed);
 }
 
 static int
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index be3178e5e2d2..ec74289af7ec 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -129,15 +129,6 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
 		       wc->status, wc->vendor_err);
 }
 
-static void
-rpcrdma_receive_worker(struct work_struct *work)
-{
-	struct rpcrdma_rep *rep =
-			container_of(work, struct rpcrdma_rep, rr_work);
-
-	rpcrdma_reply_handler(rep);
-}
-
 /* Perform basic sanity checking to avoid using garbage
  * to update the credit grant value.
  */
@@ -161,13 +152,13 @@ rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
 }
 
 /**
- * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
+ * rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
  * @cq:	completion queue (ignored)
  * @wc:	completed WR
  *
  */
 static void
-rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
+rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
 {
 	struct ib_cqe *cqe = wc->wr_cqe;
 	struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
@@ -185,6 +176,9 @@ rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
 		__func__, rep, wc->byte_len);
 
 	rep->rr_len = wc->byte_len;
+	rep->rr_wc_flags = wc->wc_flags;
+	rep->rr_inv_rkey = wc->ex.invalidate_rkey;
+
 	ib_dma_sync_single_for_cpu(rep->rr_device,
 				   rdmab_addr(rep->rr_rdmabuf),
 				   rep->rr_len, DMA_FROM_DEVICE);
@@ -204,6 +198,36 @@ out_fail:
 	goto out_schedule;
 }
 
+static void
+rpcrdma_update_connect_private(struct rpcrdma_xprt *r_xprt,
+			       struct rdma_conn_param *param)
+{
+	struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
+	const struct rpcrdma_connect_private *pmsg = param->private_data;
+	unsigned int rsize, wsize;
+
+	/* Default settings for RPC-over-RDMA Version One */
+	r_xprt->rx_ia.ri_reminv_expected = false;
+	rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+	wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
+
+	if (pmsg &&
+	    pmsg->cp_magic == rpcrdma_cmp_magic &&
+	    pmsg->cp_version == RPCRDMA_CMP_VERSION) {
+		r_xprt->rx_ia.ri_reminv_expected = true;
+		rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
+		wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
+	}
+
+	if (rsize < cdata->inline_rsize)
+		cdata->inline_rsize = rsize;
+	if (wsize < cdata->inline_wsize)
+		cdata->inline_wsize = wsize;
+	pr_info("rpcrdma: max send %u, max recv %u\n",
+		cdata->inline_wsize, cdata->inline_rsize);
+	rpcrdma_set_max_header_sizes(r_xprt);
+}
+
 static int
 rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 {
@@ -244,6 +268,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
 			" (%d initiator)\n",
 			__func__, attr->max_dest_rd_atomic,
 			attr->max_rd_atomic);
+		rpcrdma_update_connect_private(xprt, &event->param.conn);
 		goto connected;
 	case RDMA_CM_EVENT_CONNECT_ERROR:
 		connstate = -ENOTCONN;
@@ -454,11 +479,12 @@ int
 rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 				struct rpcrdma_create_data_internal *cdata)
 {
+	struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
 	struct ib_cq *sendcq, *recvcq;
 	unsigned int max_qp_wr;
 	int rc;
 
-	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
+	if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_SEND_SGES) {
 		dprintk("RPC:       %s: insufficient sge's available\n",
 			__func__);
 		return -ENOMEM;
@@ -487,7 +513,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
 	ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
 	ep->rep_attr.cap.max_recv_wr += 1;	/* drain cqe */
-	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
+	ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_SEND_SGES;
 	ep->rep_attr.cap.max_recv_sge = 1;
 	ep->rep_attr.cap.max_inline_data = 0;
 	ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -536,9 +562,14 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
 	/* Initialize cma parameters */
 	memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
 
-	/* RPC/RDMA does not use private data */
-	ep->rep_remote_cma.private_data = NULL;
-	ep->rep_remote_cma.private_data_len = 0;
+	/* Prepare RDMA-CM private message */
+	pmsg->cp_magic = rpcrdma_cmp_magic;
+	pmsg->cp_version = RPCRDMA_CMP_VERSION;
+	pmsg->cp_flags |= ia->ri_ops->ro_send_w_inv_ok;
+	pmsg->cp_send_size = rpcrdma_encode_buffer_size(cdata->inline_wsize);
+	pmsg->cp_recv_size = rpcrdma_encode_buffer_size(cdata->inline_rsize);
+	ep->rep_remote_cma.private_data = pmsg;
+	ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
 
 	/* Client offers RDMA Read but does not initiate */
 	ep->rep_remote_cma.initiator_depth = 0;
@@ -849,6 +880,10 @@ rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
 	req->rl_cqe.done = rpcrdma_wc_send;
 	req->rl_buffer = &r_xprt->rx_buf;
 	INIT_LIST_HEAD(&req->rl_registered);
+	req->rl_send_wr.next = NULL;
+	req->rl_send_wr.wr_cqe = &req->rl_cqe;
+	req->rl_send_wr.sg_list = req->rl_send_sge;
+	req->rl_send_wr.opcode = IB_WR_SEND;
 	return req;
 }
 
@@ -865,17 +900,21 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
 	if (rep == NULL)
 		goto out;
 
-	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(ia, cdata->inline_rsize,
-					       GFP_KERNEL);
+	rep->rr_rdmabuf = rpcrdma_alloc_regbuf(cdata->inline_rsize,
+					       DMA_FROM_DEVICE, GFP_KERNEL);
 	if (IS_ERR(rep->rr_rdmabuf)) {
 		rc = PTR_ERR(rep->rr_rdmabuf);
 		goto out_free;
 	}
 
 	rep->rr_device = ia->ri_device;
-	rep->rr_cqe.done = rpcrdma_receive_wc;
+	rep->rr_cqe.done = rpcrdma_wc_receive;
 	rep->rr_rxprt = r_xprt;
-	INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
+	INIT_WORK(&rep->rr_work, rpcrdma_reply_handler);
+	rep->rr_recv_wr.next = NULL;
+	rep->rr_recv_wr.wr_cqe = &rep->rr_cqe;
+	rep->rr_recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
+	rep->rr_recv_wr.num_sge = 1;
 	return rep;
 
 out_free:
@@ -966,17 +1005,18 @@ rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
 }
 
 static void
-rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
+rpcrdma_destroy_rep(struct rpcrdma_rep *rep)
 {
-	rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
+	rpcrdma_free_regbuf(rep->rr_rdmabuf);
 	kfree(rep);
 }
 
 void
-rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+rpcrdma_destroy_req(struct rpcrdma_req *req)
 {
-	rpcrdma_free_regbuf(ia, req->rl_sendbuf);
-	rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
+	rpcrdma_free_regbuf(req->rl_recvbuf);
+	rpcrdma_free_regbuf(req->rl_sendbuf);
+	rpcrdma_free_regbuf(req->rl_rdmabuf);
 	kfree(req);
 }
 
@@ -1009,15 +1049,13 @@ rpcrdma_destroy_mrs(struct rpcrdma_buffer *buf)
 void
 rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 {
-	struct rpcrdma_ia *ia = rdmab_to_ia(buf);
-
 	cancel_delayed_work_sync(&buf->rb_recovery_worker);
 
 	while (!list_empty(&buf->rb_recv_bufs)) {
 		struct rpcrdma_rep *rep;
 
 		rep = rpcrdma_buffer_get_rep_locked(buf);
-		rpcrdma_destroy_rep(ia, rep);
+		rpcrdma_destroy_rep(rep);
 	}
 	buf->rb_send_count = 0;
 
@@ -1030,7 +1068,7 @@ rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
 		list_del(&req->rl_all);
 
 		spin_unlock(&buf->rb_reqslock);
-		rpcrdma_destroy_req(ia, req);
+		rpcrdma_destroy_req(req);
 		spin_lock(&buf->rb_reqslock);
 	}
 	spin_unlock(&buf->rb_reqslock);
@@ -1129,7 +1167,7 @@ rpcrdma_buffer_put(struct rpcrdma_req *req)
 	struct rpcrdma_buffer *buffers = req->rl_buffer;
 	struct rpcrdma_rep *rep = req->rl_reply;
 
-	req->rl_niovs = 0;
+	req->rl_send_wr.num_sge = 0;
 	req->rl_reply = NULL;
 
 	spin_lock(&buffers->rb_lock);
@@ -1171,70 +1209,81 @@ rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
 	spin_unlock(&buffers->rb_lock);
 }
 
-/*
- * Wrappers for internal-use kmalloc memory registration, used by buffer code.
- */
-
 /**
- * rpcrdma_alloc_regbuf - kmalloc and register memory for SEND/RECV buffers
- * @ia: controlling rpcrdma_ia
+ * rpcrdma_alloc_regbuf - allocate and DMA-map memory for SEND/RECV buffers
  * @size: size of buffer to be allocated, in bytes
+ * @direction: direction of data movement
  * @flags: GFP flags
  *
- * Returns pointer to private header of an area of internally
- * registered memory, or an ERR_PTR. The registered buffer follows
- * the end of the private header.
+ * Returns an ERR_PTR, or a pointer to a regbuf, a buffer that
+ * can be persistently DMA-mapped for I/O.
  *
  * xprtrdma uses a regbuf for posting an outgoing RDMA SEND, or for
- * receiving the payload of RDMA RECV operations. regbufs are not
- * used for RDMA READ/WRITE operations, thus are registered only for
- * LOCAL access.
+ * receiving the payload of RDMA RECV operations. During Long Calls
+ * or Replies they may be registered externally via ro_map.
  */
 struct rpcrdma_regbuf *
-rpcrdma_alloc_regbuf(struct rpcrdma_ia *ia, size_t size, gfp_t flags)
+rpcrdma_alloc_regbuf(size_t size, enum dma_data_direction direction,
+		     gfp_t flags)
 {
 	struct rpcrdma_regbuf *rb;
-	struct ib_sge *iov;
 
 	rb = kmalloc(sizeof(*rb) + size, flags);
 	if (rb == NULL)
-		goto out;
+		return ERR_PTR(-ENOMEM);
 
-	iov = &rb->rg_iov;
-	iov->addr = ib_dma_map_single(ia->ri_device,
-				      (void *)rb->rg_base, size,
-				      DMA_BIDIRECTIONAL);
-	if (ib_dma_mapping_error(ia->ri_device, iov->addr))
-		goto out_free;
+	rb->rg_device = NULL;
+	rb->rg_direction = direction;
+	rb->rg_iov.length = size;
 
-	iov->length = size;
-	iov->lkey = ia->ri_pd->local_dma_lkey;
-	rb->rg_size = size;
-	rb->rg_owner = NULL;
 	return rb;
+}
 
-out_free:
-	kfree(rb);
-out:
-	return ERR_PTR(-ENOMEM);
+/**
+ * __rpcrdma_map_regbuf - DMA-map a regbuf
+ * @ia: controlling rpcrdma_ia
+ * @rb: regbuf to be mapped
+ */
+bool
+__rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+	if (rb->rg_direction == DMA_NONE)
+		return false;
+
+	rb->rg_iov.addr = ib_dma_map_single(ia->ri_device,
+					    (void *)rb->rg_base,
+					    rdmab_length(rb),
+					    rb->rg_direction);
+	if (ib_dma_mapping_error(ia->ri_device, rdmab_addr(rb)))
+		return false;
+
+	rb->rg_device = ia->ri_device;
+	rb->rg_iov.lkey = ia->ri_pd->local_dma_lkey;
+	return true;
+}
+
+static void
+rpcrdma_dma_unmap_regbuf(struct rpcrdma_regbuf *rb)
+{
+	if (!rpcrdma_regbuf_is_mapped(rb))
+		return;
+
+	ib_dma_unmap_single(rb->rg_device, rdmab_addr(rb),
+			    rdmab_length(rb), rb->rg_direction);
+	rb->rg_device = NULL;
 }
 
 /**
  * rpcrdma_free_regbuf - deregister and free registered buffer
- * @ia: controlling rpcrdma_ia
  * @rb: regbuf to be deregistered and freed
  */
 void
-rpcrdma_free_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+rpcrdma_free_regbuf(struct rpcrdma_regbuf *rb)
 {
-	struct ib_sge *iov;
-
 	if (!rb)
 		return;
 
-	iov = &rb->rg_iov;
-	ib_dma_unmap_single(ia->ri_device,
-			    iov->addr, iov->length, DMA_BIDIRECTIONAL);
+	rpcrdma_dma_unmap_regbuf(rb);
 	kfree(rb);
 }
 
@@ -1248,39 +1297,28 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
 		struct rpcrdma_ep *ep,
 		struct rpcrdma_req *req)
 {
-	struct ib_device *device = ia->ri_device;
-	struct ib_send_wr send_wr, *send_wr_fail;
-	struct rpcrdma_rep *rep = req->rl_reply;
-	struct ib_sge *iov = req->rl_send_iov;
-	int i, rc;
+	struct ib_send_wr *send_wr = &req->rl_send_wr;
+	struct ib_send_wr *send_wr_fail;
+	int rc;
 
-	if (rep) {
-		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+	if (req->rl_reply) {
+		rc = rpcrdma_ep_post_recv(ia, req->rl_reply);
 		if (rc)
 			return rc;
 		req->rl_reply = NULL;
 	}
 
-	send_wr.next = NULL;
-	send_wr.wr_cqe = &req->rl_cqe;
-	send_wr.sg_list = iov;
-	send_wr.num_sge = req->rl_niovs;
-	send_wr.opcode = IB_WR_SEND;
-
-	for (i = 0; i < send_wr.num_sge; i++)
-		ib_dma_sync_single_for_device(device, iov[i].addr,
-					      iov[i].length, DMA_TO_DEVICE);
 	dprintk("RPC:       %s: posting %d s/g entries\n",
-		__func__, send_wr.num_sge);
+		__func__, send_wr->num_sge);
 
 	if (DECR_CQCOUNT(ep) > 0)
-		send_wr.send_flags = 0;
+		send_wr->send_flags = 0;
 	else { /* Provider must take a send completion every now and then */
 		INIT_CQCOUNT(ep);
-		send_wr.send_flags = IB_SEND_SIGNALED;
+		send_wr->send_flags = IB_SEND_SIGNALED;
 	}
 
-	rc = ib_post_send(ia->ri_id->qp, &send_wr, &send_wr_fail);
+	rc = ib_post_send(ia->ri_id->qp, send_wr, &send_wr_fail);
 	if (rc)
 		goto out_postsend_err;
 	return 0;
@@ -1290,32 +1328,24 @@ out_postsend_err:
 	return -ENOTCONN;
 }
 
-/*
- * (Re)post a receive buffer.
- */
 int
 rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
-		     struct rpcrdma_ep *ep,
 		     struct rpcrdma_rep *rep)
 {
-	struct ib_recv_wr recv_wr, *recv_wr_fail;
+	struct ib_recv_wr *recv_wr_fail;
 	int rc;
 
-	recv_wr.next = NULL;
-	recv_wr.wr_cqe = &rep->rr_cqe;
-	recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
-	recv_wr.num_sge = 1;
-
-	ib_dma_sync_single_for_cpu(ia->ri_device,
-				   rdmab_addr(rep->rr_rdmabuf),
-				   rdmab_length(rep->rr_rdmabuf),
-				   DMA_BIDIRECTIONAL);
-
-	rc = ib_post_recv(ia->ri_id->qp, &recv_wr, &recv_wr_fail);
+	if (!rpcrdma_dma_map_regbuf(ia, rep->rr_rdmabuf))
+		goto out_map;
+	rc = ib_post_recv(ia->ri_id->qp, &rep->rr_recv_wr, &recv_wr_fail);
 	if (rc)
 		goto out_postrecv;
 	return 0;
 
+out_map:
+	pr_err("rpcrdma: failed to DMA map the Receive buffer\n");
+	return -EIO;
+
 out_postrecv:
 	pr_err("rpcrdma: ib_post_recv returned %i\n", rc);
 	return -ENOTCONN;
@@ -1333,7 +1363,6 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
 {
 	struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
 	struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-	struct rpcrdma_ep *ep = &r_xprt->rx_ep;
 	struct rpcrdma_rep *rep;
 	int rc;
 
@@ -1344,7 +1373,7 @@ rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
 		rep = rpcrdma_buffer_get_rep_locked(buffers);
 		spin_unlock(&buffers->rb_lock);
 
-		rc = rpcrdma_ep_post_recv(ia, ep, rep);
+		rc = rpcrdma_ep_post_recv(ia, rep);
 		if (rc)
 			goto out_rc;
 	}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index a71b0f5897d8..0d35b761c883 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -70,9 +70,11 @@ struct rpcrdma_ia {
 	struct ib_pd		*ri_pd;
 	struct completion	ri_done;
 	int			ri_async_rc;
+	unsigned int		ri_max_segs;
 	unsigned int		ri_max_frmr_depth;
 	unsigned int		ri_max_inline_write;
 	unsigned int		ri_max_inline_read;
+	bool			ri_reminv_expected;
 	struct ib_qp_attr	ri_qp_attr;
 	struct ib_qp_init_attr	ri_qp_init_attr;
 };
@@ -87,6 +89,7 @@ struct rpcrdma_ep {
 	int			rep_connected;
 	struct ib_qp_init_attr	rep_attr;
 	wait_queue_head_t 	rep_connect_wait;
+	struct rpcrdma_connect_private	rep_cm_private;
 	struct rdma_conn_param	rep_remote_cma;
 	struct sockaddr_storage	rep_remote_addr;
 	struct delayed_work	rep_connect_worker;
@@ -112,9 +115,9 @@ struct rpcrdma_ep {
  */
 
 struct rpcrdma_regbuf {
-	size_t			rg_size;
-	struct rpcrdma_req	*rg_owner;
 	struct ib_sge		rg_iov;
+	struct ib_device	*rg_device;
+	enum dma_data_direction	rg_direction;
 	__be32			rg_base[0] __attribute__ ((aligned(256)));
 };
 
@@ -162,7 +165,10 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
  * The smallest inline threshold is 1024 bytes, ensuring that
  * at least 750 bytes are available for RPC messages.
  */
-#define RPCRDMA_MAX_HDR_SEGS	(8)
+enum {
+	RPCRDMA_MAX_HDR_SEGS = 8,
+	RPCRDMA_HDRBUF_SIZE = 256,
+};
 
 /*
  * struct rpcrdma_rep -- this structure encapsulates state required to recv
@@ -182,10 +188,13 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
 struct rpcrdma_rep {
 	struct ib_cqe		rr_cqe;
 	unsigned int		rr_len;
+	int			rr_wc_flags;
+	u32			rr_inv_rkey;
 	struct ib_device	*rr_device;
 	struct rpcrdma_xprt	*rr_rxprt;
 	struct work_struct	rr_work;
 	struct list_head	rr_list;
+	struct ib_recv_wr	rr_recv_wr;
 	struct rpcrdma_regbuf	*rr_rdmabuf;
 };
 
@@ -276,19 +285,30 @@ struct rpcrdma_mr_seg {		/* chunk descriptors */
 	char		*mr_offset;	/* kva if no page, else offset */
 };
 
-#define RPCRDMA_MAX_IOVS	(2)
+/* Reserve enough Send SGEs to send a maximum size inline request:
+ * - RPC-over-RDMA header
+ * - xdr_buf head iovec
+ * - RPCRDMA_MAX_INLINE bytes, possibly unaligned, in pages
+ * - xdr_buf tail iovec
+ */
+enum {
+	RPCRDMA_MAX_SEND_PAGES = PAGE_SIZE + RPCRDMA_MAX_INLINE - 1,
+	RPCRDMA_MAX_PAGE_SGES = (RPCRDMA_MAX_SEND_PAGES >> PAGE_SHIFT) + 1,
+	RPCRDMA_MAX_SEND_SGES = 1 + 1 + RPCRDMA_MAX_PAGE_SGES + 1,
+};
 
 struct rpcrdma_buffer;
 struct rpcrdma_req {
 	struct list_head	rl_free;
-	unsigned int		rl_niovs;
+	unsigned int		rl_mapped_sges;
 	unsigned int		rl_connect_cookie;
-	struct rpc_task		*rl_task;
 	struct rpcrdma_buffer	*rl_buffer;
-	struct rpcrdma_rep	*rl_reply;/* holder for reply buffer */
-	struct ib_sge		rl_send_iov[RPCRDMA_MAX_IOVS];
-	struct rpcrdma_regbuf	*rl_rdmabuf;
-	struct rpcrdma_regbuf	*rl_sendbuf;
+	struct rpcrdma_rep	*rl_reply;
+	struct ib_send_wr	rl_send_wr;
+	struct ib_sge		rl_send_sge[RPCRDMA_MAX_SEND_SGES];
+	struct rpcrdma_regbuf	*rl_rdmabuf;	/* xprt header */
+	struct rpcrdma_regbuf	*rl_sendbuf;	/* rq_snd_buf */
+	struct rpcrdma_regbuf	*rl_recvbuf;	/* rq_rcv_buf */
 
 	struct ib_cqe		rl_cqe;
 	struct list_head	rl_all;
@@ -298,14 +318,16 @@ struct rpcrdma_req {
 	struct rpcrdma_mr_seg	rl_segments[RPCRDMA_MAX_SEGS];
 };
 
+static inline void
+rpcrdma_set_xprtdata(struct rpc_rqst *rqst, struct rpcrdma_req *req)
+{
+	rqst->rq_xprtdata = req;
+}
+
 static inline struct rpcrdma_req *
 rpcr_to_rdmar(struct rpc_rqst *rqst)
 {
-	void *buffer = rqst->rq_buffer;
-	struct rpcrdma_regbuf *rb;
-
-	rb = container_of(buffer, struct rpcrdma_regbuf, rg_base);
-	return rb->rg_owner;
+	return rqst->rq_xprtdata;
 }
 
 /*
@@ -356,15 +378,6 @@ struct rpcrdma_create_data_internal {
 	unsigned int	padding;	/* non-rdma write header padding */
 };
 
-#define RPCRDMA_INLINE_READ_THRESHOLD(rq) \
-	(rpcx_to_rdmad(rq->rq_xprt).inline_rsize)
-
-#define RPCRDMA_INLINE_WRITE_THRESHOLD(rq)\
-	(rpcx_to_rdmad(rq->rq_xprt).inline_wsize)
-
-#define RPCRDMA_INLINE_PAD_VALUE(rq)\
-	rpcx_to_rdmad(rq->rq_xprt).padding
-
 /*
  * Statistics for RPCRDMA
  */
@@ -386,6 +399,7 @@ struct rpcrdma_stats {
 	unsigned long		mrs_recovered;
 	unsigned long		mrs_orphaned;
 	unsigned long		mrs_allocated;
+	unsigned long		local_inv_needed;
 };
 
 /*
@@ -409,6 +423,7 @@ struct rpcrdma_memreg_ops {
 				      struct rpcrdma_mw *);
 	void		(*ro_release_mr)(struct rpcrdma_mw *);
 	const char	*ro_displayname;
+	const int	ro_send_w_inv_ok;
 };
 
 extern const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops;
@@ -461,15 +476,14 @@ void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
 
 int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
 				struct rpcrdma_req *);
-int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
-				struct rpcrdma_rep *);
+int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_rep *);
 
 /*
  * Buffer calls - xprtrdma/verbs.c
  */
 struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
 struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
-void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
+void rpcrdma_destroy_req(struct rpcrdma_req *);
 int rpcrdma_buffer_create(struct rpcrdma_xprt *);
 void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
 
@@ -482,10 +496,24 @@ void rpcrdma_recv_buffer_put(struct rpcrdma_rep *);
 
 void rpcrdma_defer_mr_recovery(struct rpcrdma_mw *);
 
-struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(struct rpcrdma_ia *,
-					    size_t, gfp_t);
-void rpcrdma_free_regbuf(struct rpcrdma_ia *,
-			 struct rpcrdma_regbuf *);
+struct rpcrdma_regbuf *rpcrdma_alloc_regbuf(size_t, enum dma_data_direction,
+					    gfp_t);
+bool __rpcrdma_dma_map_regbuf(struct rpcrdma_ia *, struct rpcrdma_regbuf *);
+void rpcrdma_free_regbuf(struct rpcrdma_regbuf *);
+
+static inline bool
+rpcrdma_regbuf_is_mapped(struct rpcrdma_regbuf *rb)
+{
+	return rb->rg_device != NULL;
+}
+
+static inline bool
+rpcrdma_dma_map_regbuf(struct rpcrdma_ia *ia, struct rpcrdma_regbuf *rb)
+{
+	if (likely(rpcrdma_regbuf_is_mapped(rb)))
+		return true;
+	return __rpcrdma_dma_map_regbuf(ia, rb);
+}
 
 int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
 
@@ -507,15 +535,25 @@ rpcrdma_data_dir(bool writing)
  */
 void rpcrdma_connect_worker(struct work_struct *);
 void rpcrdma_conn_func(struct rpcrdma_ep *);
-void rpcrdma_reply_handler(struct rpcrdma_rep *);
+void rpcrdma_reply_handler(struct work_struct *);
 
 /*
  * RPC/RDMA protocol calls - xprtrdma/rpc_rdma.c
  */
+
+enum rpcrdma_chunktype {
+	rpcrdma_noch = 0,
+	rpcrdma_readch,
+	rpcrdma_areadch,
+	rpcrdma_writech,
+	rpcrdma_replych
+};
+
+bool rpcrdma_prepare_send_sges(struct rpcrdma_ia *, struct rpcrdma_req *,
+			       u32, struct xdr_buf *, enum rpcrdma_chunktype);
+void rpcrdma_unmap_sges(struct rpcrdma_ia *, struct rpcrdma_req *);
 int rpcrdma_marshal_req(struct rpc_rqst *);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_ia *,
-				  struct rpcrdma_create_data_internal *,
-				  unsigned int);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
 
 /* RPC/RDMA module init - xprtrdma/transport.c
  */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index bf168838a029..0137af1c0916 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -473,7 +473,16 @@ static int xs_nospace(struct rpc_task *task)
 	spin_unlock_bh(&xprt->transport_lock);
 
 	/* Race breaker in case memory is freed before above code is called */
-	sk->sk_write_space(sk);
+	if (ret == -EAGAIN) {
+		struct socket_wq *wq;
+
+		rcu_read_lock();
+		wq = rcu_dereference(sk->sk_wq);
+		set_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags);
+		rcu_read_unlock();
+
+		sk->sk_write_space(sk);
+	}
 	return ret;
 }
 
@@ -2533,35 +2542,38 @@ static void xs_tcp_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
  * we allocate pages instead doing a kmalloc like rpc_malloc is because we want
  * to use the server side send routines.
  */
-static void *bc_malloc(struct rpc_task *task, size_t size)
+static int bc_malloc(struct rpc_task *task)
 {
+	struct rpc_rqst *rqst = task->tk_rqstp;
+	size_t size = rqst->rq_callsize;
 	struct page *page;
 	struct rpc_buffer *buf;
 
-	WARN_ON_ONCE(size > PAGE_SIZE - sizeof(struct rpc_buffer));
-	if (size > PAGE_SIZE - sizeof(struct rpc_buffer))
-		return NULL;
+	if (size > PAGE_SIZE - sizeof(struct rpc_buffer)) {
+		WARN_ONCE(1, "xprtsock: large bc buffer request (size %zu)\n",
+			  size);
+		return -EINVAL;
+	}
 
 	page = alloc_page(GFP_KERNEL);
 	if (!page)
-		return NULL;
+		return -ENOMEM;
 
 	buf = page_address(page);
 	buf->len = PAGE_SIZE;
 
-	return buf->data;
+	rqst->rq_buffer = buf->data;
+	return 0;
 }
 
 /*
  * Free the space allocated in the bc_alloc routine
  */
-static void bc_free(void *buffer)
+static void bc_free(struct rpc_task *task)
 {
+	void *buffer = task->tk_rqstp->rq_buffer;
 	struct rpc_buffer *buf;
 
-	if (!buffer)
-		return;
-
 	buf = container_of(buffer, struct rpc_buffer, data);
 	free_page((unsigned long)buf);
 }
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index d80cd3f7503f..78cab9c5a445 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -407,6 +407,7 @@ static int __tipc_nl_add_udp_addr(struct sk_buff *skb,
 	if (ntohs(addr->proto) == ETH_P_IP) {
 		struct sockaddr_in ip4;
 
+		memset(&ip4, 0, sizeof(ip4));
 		ip4.sin_family = AF_INET;
 		ip4.sin_port = addr->port;
 		ip4.sin_addr.s_addr = addr->ipv4.s_addr;
@@ -417,6 +418,7 @@ static int __tipc_nl_add_udp_addr(struct sk_buff *skb,
 	} else if (ntohs(addr->proto) == ETH_P_IPV6) {
 		struct sockaddr_in6 ip6;
 
+		memset(&ip6, 0, sizeof(ip6));
 		ip6.sin6_family = AF_INET6;
 		ip6.sin6_port  = addr->port;
 		memcpy(&ip6.sin6_addr, &addr->ipv6, sizeof(struct in6_addr));