author		Cristian Ciocaltea <cristian.ciocaltea@collabora.com>	2023-09-15 19:15:48 +0300
committer	Cristian Ciocaltea <cristian.ciocaltea@collabora.com>	2023-09-15 19:15:48 +0300
commit		c1358d94ea1fe5705dc7bd4542f262151c9844f6 (patch)
tree		515339e48fa02fc454a241921c18a30d9e215b9f /fs
parent		768bc393d4e78159a2765b0b4bb49ee3375921d1 (diff)
parent		a325a7b546ec539ca7877f258bb59beff9ef41fc (diff)
download	linux-c1358d94ea1fe5705dc7bd4542f262151c9844f6.tar.gz
Merge branch '6.1/features/mm-soft-dirty-file'
Signed-off-by: Cristian Ciocaltea <cristian.ciocaltea@collabora.com>
Diffstat (limited to 'fs')
-rw-r--r--	fs/proc/base.c		|   3
-rw-r--r--	fs/proc/internal.h	|   1
-rw-r--r--	fs/proc/task_mmu.c	| 292
3 files changed, 236 insertions(+), 60 deletions(-)
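
The diff below wires up two proc interfaces: an extended /proc/<pid>/clear_refs write protocol for range-limited soft-dirty clearing, and a new read-only /proc/<pid>/pagemap_reset node. As orientation, here is a minimal userspace sketch of the clear_refs side, reconstructed from the clear_refs_write() hunk further down; the commit itself ships no userspace code and the helper name is hypothetical. The opcode byte '6' is followed by two native unsigned longs, 17 bytes total on 64-bit:

/*
 * Usage sketch (illustration only, not part of this commit): clear the
 * soft-dirty bits for one VA range via the POC '6' opcode parsed by
 * clear_refs_write() below.  The write must be exactly 17 bytes: one
 * opcode byte plus two native 8-byte unsigned longs.
 */
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/types.h>
#include <unistd.h>

static int clear_soft_dirty_range(pid_t pid, unsigned long start,
				  unsigned long end)
{
	char path[64], buf[17];
	int fd, ret;

	snprintf(path, sizeof(path), "/proc/%d/clear_refs", (int)pid);
	fd = open(path, O_WRONLY);
	if (fd < 0)
		return -1;

	buf[0] = '6';				/* POC range opcode  */
	memcpy(buf + 1, &start, sizeof(start));	/* buffer[1..8]      */
	memcpy(buf + 9, &end, sizeof(end));	/* buffer[9..16]     */

	ret = write(fd, buf, sizeof(buf)) == sizeof(buf) ? 0 : -1;
	close(fd);
	return ret;
}

Writing a plain decimal type (e.g. "4") still takes the legacy path, and start = 0 with end = -1UL is treated as a whole-address-space clear (clear_range stays false in the kernel code).
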
diff --git a/fs/proc/base.c b/fs/proc/base.c
index 9e479d7d202b..811eccf23be1 100644
--- a/fs/proc/base.c
+++ b/fs/proc/base.c
@@ -3281,6 +3281,9 @@ static const struct pid_entry tgid_base_stuff[] = {
 	REG("smaps",      S_IRUGO, proc_pid_smaps_operations),
 	REG("smaps_rollup", S_IRUGO, proc_pid_smaps_rollup_operations),
 	REG("pagemap",    S_IRUSR, proc_pagemap_operations),
+#ifdef CONFIG_MEM_SOFT_DIRTY
+	REG("pagemap_reset", S_IRUSR, proc_pagemap_reset_operations),
+#endif
 #endif
 #ifdef CONFIG_SECURITY
 	DIR("attr",       S_IRUGO|S_IXUGO, proc_attr_dir_inode_operations, proc_attr_dir_operations),
diff --git a/fs/proc/internal.h b/fs/proc/internal.h
index b701d0207edf..836c28e46556 100644
--- a/fs/proc/internal.h
+++ b/fs/proc/internal.h
@@ -305,6 +305,7 @@ extern const struct file_operations proc_pid_smaps_operations;
 extern const struct file_operations proc_pid_smaps_rollup_operations;
 extern const struct file_operations proc_clear_refs_operations;
 extern const struct file_operations proc_pagemap_operations;
+extern const struct file_operations proc_pagemap_reset_operations;
 
 extern unsigned long task_vsize(struct mm_struct *);
 extern unsigned long task_statm(struct mm_struct *,
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a954305fbc31..b2795bd98f40 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1073,6 +1073,8 @@ enum clear_refs_types {
 
 struct clear_refs_private {
 	enum clear_refs_types type;
+	unsigned long start, end;
+	bool clear_range;
 };
 
 #ifdef CONFIG_MEM_SOFT_DIRTY
@@ -1093,8 +1095,8 @@ static inline bool pte_is_pinned(struct vm_area_struct *vma, unsigned long addr,
 	return page_maybe_dma_pinned(page);
 }
 
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
-		unsigned long addr, pte_t *pte)
+static inline bool clear_soft_dirty(struct vm_area_struct *vma,
+				    unsigned long addr, pte_t *pte)
 {
 	/*
 	 * The soft-dirty tracker uses #PF-s to catch writes
@@ -1103,37 +1105,48 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma,
 	 * of how soft-dirty works.
 	 */
 	pte_t ptent = *pte;
+	bool ret = false;
 
 	if (pte_present(ptent)) {
 		pte_t old_pte;
 
 		if (pte_is_pinned(vma, addr, ptent))
-			return;
+			return ret;
 		old_pte = ptep_modify_prot_start(vma, addr, pte);
+		ret = pte_soft_dirty(old_pte);
 		ptent = pte_wrprotect(old_pte);
 		ptent = pte_clear_soft_dirty(ptent);
+		ptent = pte_mkuffd_wp(ptent);
 		ptep_modify_prot_commit(vma, addr, pte, old_pte, ptent);
 	} else if (is_swap_pte(ptent)) {
+		ret = pte_swp_soft_dirty(ptent);
 		ptent = pte_swp_clear_soft_dirty(ptent);
+		ptent = pte_swp_mkuffd_wp(ptent);
 		set_pte_at(vma->vm_mm, addr, pte, ptent);
 	}
+	return ret;
 }
 #else
-static inline void clear_soft_dirty(struct vm_area_struct *vma,
+static inline bool clear_soft_dirty(struct vm_area_struct *vma,
 		unsigned long addr, pte_t *pte)
 {
+	return false;
 }
 #endif
 
 #if defined(CONFIG_MEM_SOFT_DIRTY) && defined(CONFIG_TRANSPARENT_HUGEPAGE)
-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmdp)
 {
 	pmd_t old, pmd = *pmdp;
+	bool ret = false;
 
 	if (pmd_present(pmd)) {
 		/* See comment in change_huge_pmd() */
 		old = pmdp_invalidate(vma, addr, pmdp);
+
+		ret = pmd_soft_dirty(old);
+
 		if (pmd_dirty(old))
 			pmd = pmd_mkdirty(pmd);
 		if (pmd_young(old))
@@ -1144,14 +1157,17 @@ static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
 
 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 	} else if (is_migration_entry(pmd_to_swp_entry(pmd))) {
+		ret = pmd_swp_soft_dirty(pmd);
 		pmd = pmd_swp_clear_soft_dirty(pmd);
 		set_pmd_at(vma->vm_mm, addr, pmdp, pmd);
 	}
+	return ret;
 }
 #else
-static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma,
+static inline bool clear_soft_dirty_pmd(struct vm_area_struct *vma,
 		unsigned long addr, pmd_t *pmdp)
 {
+	return false;
 }
 #endif
 
@@ -1160,14 +1176,19 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr,
 {
 	struct clear_refs_private *cp = walk->private;
 	struct vm_area_struct *vma = walk->vma;
+	unsigned long start = addr;
 	pte_t *pte, ptent;
 	spinlock_t *ptl;
 	struct page *page;
 
+	BUG_ON(addr < cp->start || end > cp->end);
+
 	ptl = pmd_trans_huge_lock(pmd, vma);
 	if (ptl) {
 		if (cp->type == CLEAR_REFS_SOFT_DIRTY) {
 			clear_soft_dirty_pmd(vma, addr, pmd);
+			if (cp->clear_range)
+				flush_tlb_range(vma, addr, end);
 			goto out;
 		}
 
@@ -1210,6 +1231,9 @@ out:
 		ClearPageReferenced(page);
 	}
 	pte_unmap_unlock(pte - 1, ptl);
+	if (cp->type == CLEAR_REFS_SOFT_DIRTY && cp->clear_range)
+		flush_tlb_range(vma, start, end);
+
 	cond_resched();
 	return 0;
 }
@@ -1220,9 +1244,11 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
 	struct clear_refs_private *cp = walk->private;
 	struct vm_area_struct *vma = walk->vma;
 
-	if (vma->vm_flags & VM_PFNMAP)
+	if (!cp->clear_range && (vma->vm_flags & VM_PFNMAP))
 		return 1;
 
+	BUG_ON(start < cp->start || end > cp->end);
+
 	/*
 	 * Writing 1 to /proc/pid/clear_refs affects all pages.
 	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.
@@ -1245,10 +1271,12 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 				size_t count, loff_t *ppos)
 {
 	struct task_struct *task;
-	char buffer[PROC_NUMBUF];
+	char buffer[18];
 	struct mm_struct *mm;
 	struct vm_area_struct *vma;
 	enum clear_refs_types type;
+	unsigned long start, end;
+	bool clear_range;
 	int itype;
 	int rv;
 
@@ -1257,12 +1285,34 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 		count = sizeof(buffer) - 1;
 	if (copy_from_user(buffer, buf, count))
 		return -EFAULT;
-	rv = kstrtoint(strstrip(buffer), 10, &itype);
-	if (rv < 0)
-		return rv;
-	type = (enum clear_refs_types)itype;
-	if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
-		return -EINVAL;
+
+	if (buffer[0] == '6') {
+		static int once;
+
+		if (!once++)
+			printk(KERN_DEBUG "task_mmu: Using POC clear refs range implementation.\n");
+
+		if (count != 17)
+			return -EINVAL;
+
+		type = CLEAR_REFS_SOFT_DIRTY;
+		start = *(unsigned long *)(buffer + 1);
+		end = *(unsigned long *)(buffer + 1 + 8);
+	} else {
+		rv = kstrtoint(strstrip(buffer), 10, &itype);
+		if (rv < 0)
+			return rv;
+		type = (enum clear_refs_types)itype;
+
+		if (type < CLEAR_REFS_ALL || type >= CLEAR_REFS_LAST)
+			return -EINVAL;
+
+		start = 0;
+		end = -1UL;
+	}
 
 	task = get_proc_task(file_inode(file));
 	if (!task)
@@ -1275,40 +1325,77 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf,
 			.type = type,
 		};
 
-		if (mmap_write_lock_killable(mm)) {
-			count = -EINTR;
-			goto out_mm;
+		if (start || end != -1UL) {
+			if (start >= end) {
+				count = -EINVAL;
+				goto out_mm;
+			}
+			clear_range = true;
+		} else {
+			clear_range = false;
 		}
+
+		cp.start = start;
+		cp.end = end;
+		cp.clear_range = clear_range;
+
 		if (type == CLEAR_REFS_MM_HIWATER_RSS) {
+			if (mmap_write_lock_killable(mm)) {
+				count = -EINTR;
+				goto out_mm;
+			}
+
 			/*
 			 * Writing 5 to /proc/pid/clear_refs resets the peak
 			 * resident set size to this mm's current rss value.
 			 */
 			reset_mm_hiwater_rss(mm);
-			goto out_unlock;
+			mmap_write_unlock(mm);
+			goto out_mm;
 		}
 
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
-			mas_for_each(&mas, vma, ULONG_MAX) {
-				if (!(vma->vm_flags & VM_SOFTDIRTY))
-					continue;
-				vma->vm_flags &= ~VM_SOFTDIRTY;
-				vma_set_page_prot(vma);
+			if (mmap_read_lock_killable(mm)) {
+				count = -EINTR;
+				goto out_mm;
+			}
+			if (!clear_range) {
+				mas_for_each(&mas, vma, ULONG_MAX) {
+					if (!(vma->vm_flags & VM_SOFTDIRTY))
+						continue;
+					mmap_read_unlock(mm);
+					if (mmap_write_lock_killable(mm)) {
+						count = -EINTR;
+						goto out_mm;
+					}
+					mas_for_each(&mas, vma, ULONG_MAX) {
+						vma->vm_flags &= ~VM_SOFTDIRTY;
+						vma_set_page_prot(vma);
+					}
+					mmap_write_downgrade(mm);
+					break;
+				}
 			}
-
 			inc_tlb_flush_pending(mm);
 			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
-						0, NULL, mm, 0, -1UL);
+						0, NULL, mm, start, end);
 			mmu_notifier_invalidate_range_start(&range);
+		} else {
+			if (mmap_write_lock_killable(mm)) {
+				count = -EINTR;
+				goto out_mm;
+			}
 		}
-		walk_page_range(mm, 0, -1, &clear_refs_walk_ops, &cp);
+		walk_page_range(mm, start, end, &clear_refs_walk_ops, &cp);
 		if (type == CLEAR_REFS_SOFT_DIRTY) {
 			mmu_notifier_invalidate_range_end(&range);
-			flush_tlb_mm(mm);
+			if (!clear_range)
+				flush_tlb_mm(mm);
 			dec_tlb_flush_pending(mm);
+			mmap_read_unlock(mm);
+		} else {
+			mmap_write_unlock(mm);
 		}
-out_unlock:
-		mmap_write_unlock(mm);
 out_mm:
 		mmput(mm);
 	}
@@ -1330,6 +1417,7 @@ struct pagemapread {
 	int pos, len;		/* units: PM_ENTRY_BYTES, not bytes */
 	pagemap_entry_t *buffer;
 	bool show_pfn;
+	bool reset;
 };
 
 #define PAGEMAP_WALK_SIZE	(PMD_SIZE)
@@ -1341,6 +1429,7 @@ struct pagemapread {
 #define PM_SOFT_DIRTY		BIT_ULL(55)
 #define PM_MMAP_EXCLUSIVE	BIT_ULL(56)
 #define PM_UFFD_WP		BIT_ULL(57)
+#define PM_SOFT_DIRTY_PAGE	PM_UFFD_WP
 #define PM_FILE			BIT_ULL(61)
 #define PM_SWAP			BIT_ULL(62)
 #define PM_PRESENT		BIT_ULL(63)
@@ -1361,6 +1450,14 @@ static int add_to_pagemap(unsigned long addr, pagemap_entry_t *pme,
 	return 0;
 }
 
+static int add_addr_to_pagemap(unsigned long addr, struct pagemapread *pm)
+{
+	((unsigned long *)pm->buffer)[pm->pos++] = addr;
+	if (pm->pos >= pm->len)
+		return PM_END_OF_BUFFER;
+	return 0;
+}
+
 static int pagemap_pte_hole(unsigned long start, unsigned long end,
 			    __always_unused int depth, struct mm_walk *walk)
 {
@@ -1368,6 +1465,9 @@ static int pagemap_pte_hole(unsigned long start, unsigned long end,
 	unsigned long addr = start;
 	int err = 0;
 
+	if (pm->reset)
+		goto out;
+
 	while (addr < end) {
 		struct vm_area_struct *vma = find_vma(walk->mm, addr);
 		pagemap_entry_t pme = make_pme(0, 0);
@@ -1402,8 +1502,9 @@ out:
 }
 
 static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
-		struct vm_area_struct *vma, unsigned long addr, pte_t pte)
+		struct vm_area_struct *vma, unsigned long addr, pte_t *pte_addr)
 {
+	pte_t pte = *pte_addr;
 	u64 frame = 0, flags = 0;
 	struct page *page = NULL;
 	bool migration = false;
@@ -1414,15 +1515,11 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		flags |= PM_PRESENT;
 		page = vm_normal_page(vma, addr, pte);
 		if (pte_soft_dirty(pte))
-			flags |= PM_SOFT_DIRTY;
-		if (pte_uffd_wp(pte))
-			flags |= PM_UFFD_WP;
+			flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
 	} else if (is_swap_pte(pte)) {
 		swp_entry_t entry;
 		if (pte_swp_soft_dirty(pte))
-			flags |= PM_SOFT_DIRTY;
-		if (pte_swp_uffd_wp(pte))
-			flags |= PM_UFFD_WP;
+			flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
 		entry = pte_to_swp_entry(pte);
 		if (pm->show_pfn) {
 			pgoff_t offset;
@@ -1441,8 +1538,6 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm,
 		migration = is_migration_entry(entry);
 		if (is_pfn_swap_entry(entry))
 			page = pfn_swap_entry_to_page(entry);
-		if (pte_marker_entry_uffd_wp(entry))
-			flags |= PM_UFFD_WP;
 	}
 
 	if (page && !PageAnon(page))
@@ -1460,6 +1555,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 {
 	struct vm_area_struct *vma = walk->vma;
 	struct pagemapread *pm = walk->private;
+	unsigned long start = addr;
 	spinlock_t *ptl;
 	pte_t *pte, *orig_pte;
 	int err = 0;
@@ -1472,6 +1568,21 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 		pmd_t pmd = *pmdp;
 		struct page *page = NULL;
 
+		if (pm->reset) {
+			if (clear_soft_dirty_pmd(vma, addr, pmdp)) {
+				for (; addr != end; addr += PAGE_SIZE) {
+					err = add_addr_to_pagemap(addr, pm);
+					if (err)
+						break;
+				}
+			}
+			flush_tlb_range(vma, start, end);
+			goto trans_huge_done;
+		}
+
 		if (vma->vm_flags & VM_SOFTDIRTY)
 			flags |= PM_SOFT_DIRTY;
 
@@ -1480,9 +1591,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 
 			flags |= PM_PRESENT;
 			if (pmd_soft_dirty(pmd))
-				flags |= PM_SOFT_DIRTY;
-			if (pmd_uffd_wp(pmd))
-				flags |= PM_UFFD_WP;
+				flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
 			if (pm->show_pfn)
 				frame = pmd_pfn(pmd) +
 					((addr & ~PMD_MASK) >> PAGE_SHIFT);
@@ -1504,9 +1613,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 			}
 			flags |= PM_SWAP;
 			if (pmd_swp_soft_dirty(pmd))
-				flags |= PM_SOFT_DIRTY;
-			if (pmd_swp_uffd_wp(pmd))
-				flags |= PM_UFFD_WP;
+				flags |= PM_SOFT_DIRTY | PM_SOFT_DIRTY_PAGE;
 			VM_BUG_ON(!is_pmd_migration_entry(pmd));
 			migration = is_migration_entry(entry);
 			page = pfn_swap_entry_to_page(entry);
@@ -1529,6 +1636,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 					frame += (1 << MAX_SWAPFILES_SHIFT);
 			}
 		}
+trans_huge_done:
 		spin_unlock(ptl);
 		return err;
 	}
@@ -1543,14 +1651,24 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end,
 	 */
 	orig_pte = pte = pte_offset_map_lock(walk->mm, pmdp, addr, &ptl);
 	for (; addr < end; pte++, addr += PAGE_SIZE) {
-		pagemap_entry_t pme;
+		if (pm->reset) {
+			if (clear_soft_dirty(vma, addr, pte))
+				err = add_addr_to_pagemap(addr, pm);
+		} else {
+			pagemap_entry_t pme;
 
-		pme = pte_to_pagemap_entry(pm, vma, addr, *pte);
-		err = add_to_pagemap(addr, &pme, pm);
+			pme = pte_to_pagemap_entry(pm, vma, addr, pte);
+			err = add_to_pagemap(addr, &pme, pm);
+		}
 		if (err)
 			break;
 	}
 	pte_unmap_unlock(orig_pte, ptl);
+	if (pm->reset)
+		flush_tlb_range(vma, start, end);
 
 	cond_resched();
 
@@ -1582,15 +1700,10 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask,
 		if (page_mapcount(page) == 1)
 			flags |= PM_MMAP_EXCLUSIVE;
 
-		if (huge_pte_uffd_wp(pte))
-			flags |= PM_UFFD_WP;
-
 		flags |= PM_PRESENT;
 		if (pm->show_pfn)
 			frame = pte_pfn(pte) +
 				((addr & ~hmask) >> PAGE_SHIFT);
-	} else if (pte_swp_uffd_wp_any(pte)) {
-		flags |= PM_UFFD_WP;
 	}
 
 	for (; addr != end; addr += PAGE_SIZE) {
@@ -1644,8 +1757,8 @@ static const struct mm_walk_ops pagemap_ops = {
  * determine which areas of memory are actually mapped and llseek to
  * skip over unmapped regions.
  */
-static ssize_t pagemap_read(struct file *file, char __user *buf,
-			    size_t count, loff_t *ppos)
+static ssize_t do_pagemap_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos, bool reset)
 {
 	struct mm_struct *mm = file->private_data;
 	struct pagemapread pm;
@@ -1654,6 +1767,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 	unsigned long start_vaddr;
 	unsigned long end_vaddr;
 	int ret = 0, copied = 0;
+	struct mmu_notifier_range range;
+	size_t buffer_len;
 
 	if (!mm || !mmget_not_zero(mm))
 		goto out;
@@ -1669,19 +1784,38 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 
 	/* do not disclose physical addresses: attack vector */
 	pm.show_pfn = file_ns_capable(file, &init_user_ns, CAP_SYS_ADMIN);
+	pm.reset = reset;
+
+	buffer_len = min(PAGEMAP_WALK_SIZE >> PAGE_SHIFT, count / PM_ENTRY_BYTES);
 
-	pm.len = (PAGEMAP_WALK_SIZE >> PAGE_SHIFT);
-	pm.buffer = kmalloc_array(pm.len, PM_ENTRY_BYTES, GFP_KERNEL);
+	pm.buffer = kmalloc_array(buffer_len, PM_ENTRY_BYTES, GFP_KERNEL);
 	ret = -ENOMEM;
 	if (!pm.buffer)
 		goto out_mm;
 
 	src = *ppos;
 	svpfn = src / PM_ENTRY_BYTES;
-	end_vaddr = mm->task_size;
+
+	start_vaddr = svpfn << PAGE_SHIFT;
+
+	if (reset) {
+		if (count < sizeof(end_vaddr)) {
+			ret = -EINVAL;
+			goto out_free;
+		}
+		if (copy_from_user(&end_vaddr, buf, sizeof(end_vaddr))) {
+			ret = -EFAULT;
+			goto out_free;
+		}
+		end_vaddr = min(end_vaddr, mm->task_size);
+	} else {
+		end_vaddr = mm->task_size;
+		start_vaddr = end_vaddr;
+	}
 
 	/* watch out for wraparound */
-	start_vaddr = end_vaddr;
 	if (svpfn <= (ULONG_MAX >> PAGE_SHIFT))
 		start_vaddr = untagged_addr(svpfn << PAGE_SHIFT);
 
@@ -1701,18 +1835,34 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		unsigned long end;
 
 		pm.pos = 0;
-		end = (start_vaddr + PAGEMAP_WALK_SIZE) & PAGEMAP_WALK_MASK;
+		pm.len = min(buffer_len, count / PM_ENTRY_BYTES);
+
+		end = reset ? end_vaddr : (start_vaddr + (pm.len << PAGE_SHIFT));
 		/* overflow ? */
 		if (end < start_vaddr || end > end_vaddr)
 			end = end_vaddr;
+
 		ret = mmap_read_lock_killable(mm);
 		if (ret)
 			goto out_free;
+
+		if (reset) {
+			inc_tlb_flush_pending(mm);
+			mmu_notifier_range_init(&range, MMU_NOTIFY_SOFT_DIRTY,
+						0, NULL, mm, start_vaddr, end);
+			mmu_notifier_invalidate_range_start(&range);
+		}
 		ret = walk_page_range(mm, start_vaddr, end, &pagemap_ops, &pm);
+		if (reset) {
+			mmu_notifier_invalidate_range_end(&range);
+			dec_tlb_flush_pending(mm);
+		}
 		mmap_read_unlock(mm);
-		start_vaddr = end;
 
 		len = min(count, PM_ENTRY_BYTES * pm.pos);
+		BUG_ON(ret && ret != PM_END_OF_BUFFER);
 		if (copy_to_user(buf, pm.buffer, len)) {
 			ret = -EFAULT;
 			goto out_free;
@@ -1720,6 +1870,8 @@ static ssize_t pagemap_read(struct file *file, char __user *buf,
 		copied += len;
 		buf += len;
 		count -= len;
+
+		start_vaddr = reset && pm.pos == pm.len ?
+			((unsigned long *)pm.buffer)[pm.pos - 1] + PAGE_SIZE : end;
 	}
 	*ppos += copied;
 	if (!ret || ret == PM_END_OF_BUFFER)
@@ -1733,6 +1885,18 @@ out:
 	return ret;
 }
 
+static ssize_t pagemap_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	return do_pagemap_read(file, buf, count, ppos, false);
+}
+
+static ssize_t pagemap_reset_read(struct file *file, char __user *buf,
+			    size_t count, loff_t *ppos)
+{
+	return do_pagemap_read(file, buf, count, ppos, true);
+}
+
 static int pagemap_open(struct inode *inode, struct file *file)
 {
 	struct mm_struct *mm;
@@ -1759,6 +1923,14 @@ const struct file_operations proc_pagemap_operations = {
 	.open		= pagemap_open,
 	.release	= pagemap_release,
 };
+
+const struct file_operations proc_pagemap_reset_operations = {
+	.llseek		= mem_lseek, /* borrow this */
+	.read		= pagemap_reset_read,
+	.open		= pagemap_open,
+	.release	= pagemap_release,
+};
+
 #endif /* CONFIG_PROC_PAGE_MONITOR */
 
 #ifdef CONFIG_NUMA
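
For the read side, a companion sketch of /proc/<pid>/pagemap_reset, reconstructed from do_pagemap_read() above with reset == true (again a hypothetical userspace illustration, not shipped with the commit): the file offset selects the first page of the range, the first 8 bytes of the user buffer pass end_vaddr into the kernel, and every 8-byte entry returned is the virtual address of a page whose soft-dirty bit was set and has just been cleared. This assumes 4K pages and a kernel built with CONFIG_MEM_SOFT_DIRTY.

/*
 * Usage sketch (illustration only): walk [start, end) through
 * /proc/<pid>/pagemap_reset (fd opened O_RDONLY), printing each page
 * whose soft-dirty bit was set; the kernel clears the bit as it goes.
 */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define PAGE_SHIFT	12	/* assumption: 4K pages */
#define PM_ENTRY_BYTES	8	/* matches the kernel code above */

static void dump_and_reset_soft_dirty(int fd, uint64_t start, uint64_t end)
{
	uint64_t buf[512];
	ssize_t len;

	/* mem_lseek(): the offset encodes the first page of the range. */
	lseek(fd, (start >> PAGE_SHIFT) * PM_ENTRY_BYTES, SEEK_SET);

	for (;;) {
		/* Inbound: buf[0] passes the end address to the kernel. */
		memcpy(buf, &end, sizeof(end));
		len = read(fd, buf, sizeof(buf));
		if (len <= 0)
			break;

		for (ssize_t i = 0; i < len / PM_ENTRY_BYTES; i++)
			printf("cleared soft-dirty page at 0x%" PRIx64 "\n",
			       buf[i]);

		if ((size_t)len < sizeof(buf))
			break;	/* walk reached end_vaddr */

		/* Buffer filled: resume after the last reported address. */
		lseek(fd, ((buf[len / PM_ENTRY_BYTES - 1] >> PAGE_SHIFT) + 1)
			  * PM_ENTRY_BYTES, SEEK_SET);
	}
}

Re-seeking after each full buffer mirrors the kernel's in-call continuation (last reported address plus PAGE_SIZE); the plain f_pos arithmetic is not meaningful across read() calls in reset mode, since *ppos advances by bytes copied rather than by pages walked.
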