From 56873f43abdcd574b25105867a990f067747b2f4 Mon Sep 17 00:00:00 2001
From: "Wang, Yalin"
Date: Wed, 11 Feb 2015 15:24:51 -0800
Subject: mm: add KPF_ZERO_PAGE flag for /proc/kpageflags

Add KPF_ZERO_PAGE flag for zero_page, so that userspace processes can
detect zero_page in /proc/kpageflags, and then do memory analysis more
accurately.

Signed-off-by: Yalin Wang
Acked-by: Kirill A. Shutemov
Cc: Konstantin Khlebnikov
Cc: Naoya Horiguchi
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/page.c | 16 +++++++++++++---
 1 file changed, 13 insertions(+), 3 deletions(-)

diff --git a/fs/proc/page.c b/fs/proc/page.c
index 1e3187da1fed..7eee2d8b97d9 100644
--- a/fs/proc/page.c
+++ b/fs/proc/page.c
@@ -5,6 +5,7 @@
 #include <linux/ksm.h>
 #include <linux/mm.h>
 #include <linux/mmzone.h>
+#include <linux/huge_mm.h>
 #include <linux/proc_fs.h>
 #include <linux/seq_file.h>
 #include <linux/hugetlb.h>
@@ -121,9 +122,18 @@ u64 stable_page_flags(struct page *page)
 	 * just checks PG_head/PG_tail, so we need to check PageLRU/PageAnon
 	 * to make sure a given page is a thp, not a non-huge compound page.
 	 */
-	else if (PageTransCompound(page) && (PageLRU(compound_head(page)) ||
-					     PageAnon(compound_head(page))))
-		u |= 1 << KPF_THP;
+	else if (PageTransCompound(page)) {
+		struct page *head = compound_head(page);
+
+		if (PageLRU(head) || PageAnon(head))
+			u |= 1 << KPF_THP;
+		else if (is_huge_zero_page(head)) {
+			u |= 1 << KPF_ZERO_PAGE;
+			u |= 1 << KPF_THP;
+		}
+	} else if (is_zero_pfn(page_to_pfn(page)))
+		u |= 1 << KPF_ZERO_PAGE;
+
 
 	/*
 	 * Caveats on high order pages: page->_count will only be set
-- cgit 1.4.1
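A quick way to consume the new flag from userspace: translate a virtual
address to a PFN via /proc/self/pagemap, then index /proc/kpageflags with
that PFN and test bit 24 (the KPF_ZERO_PAGE value this patch defines in
kernel-page-flags.h).  The sketch below is illustrative only, assumes root
access for /proc/kpageflags, and trims most error handling:

	#include <fcntl.h>
	#include <stdint.h>
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define KPF_ZERO_PAGE	24	/* from the new kernel-page-flags.h */

	/* Return 1 if vaddr is currently backed by the zero page. */
	static int backed_by_zero_page(int pagemap, int kpageflags,
				       uintptr_t vaddr)
	{
		uint64_t ent, flags;
		long psz = sysconf(_SC_PAGESIZE);

		if (pread(pagemap, &ent, 8, (vaddr / psz) * 8) != 8)
			return -1;
		if (!(ent & (1ULL << 63)))	/* page not present */
			return 0;
		/* bits 0-54 of a pagemap entry hold the PFN */
		if (pread(kpageflags, &flags, 8,
			  (ent & ((1ULL << 55) - 1)) * 8) != 8)
			return -1;
		return !!(flags & (1ULL << KPF_ZERO_PAGE));
	}

	int main(void)
	{
		int pagemap = open("/proc/self/pagemap", O_RDONLY);
		int kpageflags = open("/proc/kpageflags", O_RDONLY);
		char *p = mmap(NULL, 4096, PROT_READ,
			       MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);

		(void)*(volatile char *)p;	/* read fault maps the zero page */
		printf("zero page: %d\n",
		       backed_by_zero_page(pagemap, kpageflags, (uintptr_t)p));
		return 0;
	}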
From dc6c9a35b66b520cf67e05d8ca60ebecad3b0479 Mon Sep 17 00:00:00 2001
From: "Kirill A. Shutemov"
Date: Wed, 11 Feb 2015 15:26:50 -0800
Subject: mm: account pmd page tables to the process

Dave noticed that an unprivileged process can allocate a significant
amount of memory -- >500 MiB on x86_64 -- and stay unnoticed by the
oom-killer and the memory cgroup.  The trick is to allocate a lot of PMD
page tables.  The Linux kernel doesn't account PMD tables to the process,
only PTE tables.

The use-case below uses a few tricks to allocate a lot of PMD page tables
while keeping VmRSS and VmPTE low.  oom_score for the process will be 0.

	#include <stdio.h>
	#include <stdlib.h>
	#include <unistd.h>
	#include <sys/mman.h>
	#include <sys/prctl.h>
	#include <sys/wait.h>

	#define PUD_SIZE (1UL << 30)
	#define PMD_SIZE (1UL << 21)

	#define NR_PUD 130000

	int main(void)
	{
		char *addr = NULL;
		unsigned long i;

		prctl(PR_SET_THP_DISABLE);
		for (i = 0; i < NR_PUD; i++) {
			addr = mmap(addr + PUD_SIZE, PUD_SIZE, PROT_WRITE|PROT_READ,
					MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
			if (addr == MAP_FAILED) {
				perror("mmap");
				break;
			}
			*addr = 'x';
			munmap(addr, PMD_SIZE);
			mmap(addr, PMD_SIZE, PROT_WRITE|PROT_READ,
					MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
			if (addr == MAP_FAILED)
				perror("re-mmap"), exit(1);
		}
		printf("PID %d consumed %lu KiB in PMD page tables\n",
				getpid(), i * 4096 >> 10);
		return pause();
	}

The patch addresses the issue by accounting PMD tables to the process the
same way we account PTE tables.  The main places where PMD tables are
accounted are __pmd_alloc() and free_pmd_range().  But there are a few
corner cases:

 - HugeTLB can share PMD page tables.  The patch handles this by
   accounting the table to all processes that share it.
 - x86 PAE pre-allocates a few PMD tables on fork.
 - Architectures with FIRST_USER_ADDRESS > 0.  We need to adjust the
   sanity check on exit(2).

Accounting only happens on configurations where the PMD page table level
is present (PMD is not folded).  As with nr_ptes, we use a per-mm counter.
The counter value is used to calculate the baseline for the badness score
by the oom-killer.

Signed-off-by: Kirill A. Shutemov
Reported-by: Dave Hansen
Cc: Hugh Dickins
Reviewed-by: Cyrill Gorcunov
Cc: Pavel Emelyanov
Cc: David Rientjes
Tested-by: Sedat Dilek
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 Documentation/sysctl/vm.txt | 12 ++++++------
 arch/x86/mm/pgtable.c       | 14 +++++++++-----
 fs/proc/task_mmu.c          |  9 ++++++---
 include/linux/mm.h          | 24 ++++++++++++++++++++++++
 include/linux/mm_types.h    |  3 ++-
 kernel/fork.c               |  3 +++
 mm/debug.c                  |  3 ++-
 mm/hugetlb.c                |  8 ++++++--
 mm/memory.c                 | 15 +++++++++------
 mm/mmap.c                   |  4 +++-
 mm/oom_kill.c               |  9 +++++----
 11 files changed, 75 insertions(+), 29 deletions(-)

diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt
index 4415aa915681..e9c706e4627a 100644
--- a/Documentation/sysctl/vm.txt
+++ b/Documentation/sysctl/vm.txt
@@ -555,12 +555,12 @@ this is causing problems for your system/application.
 
 oom_dump_tasks
 
-Enables a system-wide task dump (excluding kernel threads) to be
-produced when the kernel performs an OOM-killing and includes such
-information as pid, uid, tgid, vm size, rss, nr_ptes, swapents,
-oom_score_adj score, and name.  This is helpful to determine why the
-OOM killer was invoked, to identify the rogue task that caused it,
-and to determine why the OOM killer chose the task it did to kill.
+Enables a system-wide task dump (excluding kernel threads) to be produced
+when the kernel performs an OOM-killing and includes such information as
+pid, uid, tgid, vm size, rss, nr_ptes, nr_pmds, swapents, oom_score_adj
+score, and name.  This is helpful to determine why the OOM killer was
+invoked, to identify the rogue task that caused it, and to determine why
+the OOM killer chose the task it did to kill.
 
 If this is set to zero, this information is suppressed.  On very
 large systems with thousands of tasks it may not be feasible to dump

diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c
index 6fb6927f9e76..7b22adaad4f1 100644
--- a/arch/x86/mm/pgtable.c
+++ b/arch/x86/mm/pgtable.c
@@ -190,7 +190,7 @@ void pud_populate(struct mm_struct *mm, pud_t *pudp, pmd_t *pmd)
 
 #endif	/* CONFIG_X86_PAE */
 
-static void free_pmds(pmd_t *pmds[])
+static void free_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 
@@ -198,10 +198,11 @@ static void free_pmds(pmd_t *pmds[])
 		if (pmds[i]) {
 			pgtable_pmd_page_dtor(virt_to_page(pmds[i]));
 			free_page((unsigned long)pmds[i]);
+			mm_dec_nr_pmds(mm);
 		}
 }
 
-static int preallocate_pmds(pmd_t *pmds[])
+static int preallocate_pmds(struct mm_struct *mm, pmd_t *pmds[])
 {
 	int i;
 	bool failed = false;
@@ -215,11 +216,13 @@ static int preallocate_pmds(pmd_t *pmds[])
 			pmd = NULL;
 			failed = true;
 		}
+		if (pmd)
+			mm_inc_nr_pmds(mm);
 		pmds[i] = pmd;
 	}
 
 	if (failed) {
-		free_pmds(pmds);
+		free_pmds(mm, pmds);
 		return -ENOMEM;
 	}
 
@@ -246,6 +249,7 @@ static void pgd_mop_up_pmds(struct mm_struct *mm, pgd_t *pgdp)
 
 			paravirt_release_pmd(pgd_val(pgd) >> PAGE_SHIFT);
 			pmd_free(mm, pmd);
+			mm_dec_nr_pmds(mm);
 		}
 	}
 }
@@ -283,7 +287,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 
 	mm->pgd = pgd;
 
-	if (preallocate_pmds(pmds) != 0)
+	if (preallocate_pmds(mm, pmds) != 0)
 		goto out_free_pgd;
 
 	if (paravirt_pgd_alloc(mm) != 0)
@@ -304,7 +308,7 @@ pgd_t *pgd_alloc(struct mm_struct *mm)
 	return pgd;
 
 out_free_pmds:
-	free_pmds(pmds);
+	free_pmds(mm, pmds);
 out_free_pgd:
 	free_page((unsigned long)pgd);
 out:
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 6396f88c6687..e6e0abeb5d12 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -21,7 +21,7 @@
 
 void task_mem(struct seq_file *m, struct mm_struct *mm)
 {
-	unsigned long data, text, lib, swap;
+	unsigned long data, text, lib, swap, ptes, pmds;
 	unsigned long hiwater_vm, total_vm, hiwater_rss, total_rss;
 
 	/*
@@ -42,6 +42,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 	text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10;
 	lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text;
 	swap = get_mm_counter(mm, MM_SWAPENTS);
+	ptes = PTRS_PER_PTE * sizeof(pte_t) * atomic_long_read(&mm->nr_ptes);
+	pmds = PTRS_PER_PMD * sizeof(pmd_t) * mm_nr_pmds(mm);
 	seq_printf(m,
 		"VmPeak:\t%8lu kB\n"
 		"VmSize:\t%8lu kB\n"
@@ -54,6 +56,7 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		"VmExe:\t%8lu kB\n"
 		"VmLib:\t%8lu kB\n"
 		"VmPTE:\t%8lu kB\n"
+		"VmPMD:\t%8lu kB\n"
 		"VmSwap:\t%8lu kB\n",
 		hiwater_vm << (PAGE_SHIFT-10),
 		total_vm << (PAGE_SHIFT-10),
@@ -63,8 +66,8 @@ void task_mem(struct seq_file *m, struct mm_struct *mm)
 		total_rss << (PAGE_SHIFT-10),
 		data << (PAGE_SHIFT-10),
 		mm->stack_vm << (PAGE_SHIFT-10), text, lib,
-		(PTRS_PER_PTE * sizeof(pte_t) *
-		 atomic_long_read(&mm->nr_ptes)) >> 10,
+		ptes >> 10,
+		pmds >> 10,
 		swap << (PAGE_SHIFT-10));
 }

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c6bf813a6b3d..644990b83cda 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1438,8 +1438,32 @@ static inline int __pmd_alloc(struct mm_struct *mm, pud_t *pud,
 						unsigned long address)
 {
 	return 0;
 }
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+	return 0;
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm) {}
+static inline void mm_dec_nr_pmds(struct mm_struct *mm) {}
+
 #else
 int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address);
+
+static inline unsigned long mm_nr_pmds(struct mm_struct *mm)
+{
+	return atomic_long_read(&mm->nr_pmds);
+}
+
+static inline void mm_inc_nr_pmds(struct mm_struct *mm)
+{
+	atomic_long_inc(&mm->nr_pmds);
+}
+
+static inline void mm_dec_nr_pmds(struct mm_struct *mm)
+{
+	atomic_long_dec(&mm->nr_pmds);
+}
 #endif
 
 int __pte_alloc(struct mm_struct *mm, struct vm_area_struct *vma,
diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h
index 20ff2105b564..199a03aab8dc 100644
--- a/include/linux/mm_types.h
+++ b/include/linux/mm_types.h
@@ -363,7 +363,8 @@ struct mm_struct {
 	pgd_t * pgd;
 	atomic_t mm_users;		/* How many users with user space? */
 	atomic_t mm_count;		/* How many references to "struct mm_struct" (users count as 1) */
-	atomic_long_t nr_ptes;		/* Page table pages */
+	atomic_long_t nr_ptes;		/* PTE page table pages */
+	atomic_long_t nr_pmds;		/* PMD page table pages */
 	int map_count;			/* number of VMAs */
 	spinlock_t page_table_lock;	/* Protects page tables and some counters */

diff --git a/kernel/fork.c b/kernel/fork.c
index b379d9abddc7..c99098c52641 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -555,6 +555,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p)
 	INIT_LIST_HEAD(&mm->mmlist);
 	mm->core_state = NULL;
 	atomic_long_set(&mm->nr_ptes, 0);
+#ifndef __PAGETABLE_PMD_FOLDED
+	atomic_long_set(&mm->nr_pmds, 0);
+#endif
 	mm->map_count = 0;
 	mm->locked_vm = 0;
 	mm->pinned_vm = 0;

diff --git a/mm/debug.c b/mm/debug.c
index d69cb5a7ba9a..3eb3ac2fcee7 100644
--- a/mm/debug.c
+++ b/mm/debug.c
@@ -173,7 +173,7 @@ void dump_mm(const struct mm_struct *mm)
 		"get_unmapped_area %p\n"
 #endif
 		"mmap_base %lu mmap_legacy_base %lu highest_vm_end %lu\n"
-		"pgd %p mm_users %d mm_count %d nr_ptes %lu map_count %d\n"
+		"pgd %p mm_users %d mm_count %d nr_ptes %lu nr_pmds %lu map_count %d\n"
 		"hiwater_rss %lx hiwater_vm %lx total_vm %lx locked_vm %lx\n"
 		"pinned_vm %lx shared_vm %lx exec_vm %lx stack_vm %lx\n"
 		"start_code %lx end_code %lx start_data %lx end_data %lx\n"
@@ -206,6 +206,7 @@ void dump_mm(const struct mm_struct *mm)
 		mm->pgd, atomic_read(&mm->mm_users),
 		atomic_read(&mm->mm_count),
 		atomic_long_read((atomic_long_t *)&mm->nr_ptes),
+		mm_nr_pmds((struct mm_struct *)mm),
 		mm->map_count,
 		mm->hiwater_rss, mm->hiwater_vm, mm->total_vm, mm->locked_vm,
 		mm->pinned_vm, mm->shared_vm, mm->exec_vm, mm->stack_vm,

diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index fd28d6ba5e5d..0a9ac6c26832 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -3598,6 +3598,7 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 		if (saddr) {
 			spte = huge_pte_offset(svma->vm_mm, saddr);
 			if (spte) {
+				mm_inc_nr_pmds(mm);
 				get_page(virt_to_page(spte));
 				break;
 			}
@@ -3609,11 +3610,13 @@ pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud)
 
 	ptl = huge_pte_lockptr(hstate_vma(vma), mm, spte);
 	spin_lock(ptl);
-	if (pud_none(*pud))
+	if (pud_none(*pud)) {
 		pud_populate(mm, pud,
 				(pmd_t *)((unsigned long)spte & PAGE_MASK));
-	else
+	} else {
 		put_page(virt_to_page(spte));
+		mm_inc_nr_pmds(mm);
+	}
 	spin_unlock(ptl);
 out:
 	pte = (pte_t *)pmd_alloc(mm, pud, addr);
@@ -3644,6 +3647,7 @@ int huge_pmd_unshare(struct mm_struct *mm, unsigned long *addr, pte_t *ptep)
 
 	pud_clear(pud);
 	put_page(virt_to_page(ptep));
+	mm_dec_nr_pmds(mm);
 	*addr = ALIGN(*addr, HPAGE_SIZE * PTRS_PER_PTE) - HPAGE_SIZE;
 	return 1;
 }

diff --git a/mm/memory.c b/mm/memory.c
index d63849b5188f..bbe6a73a899d 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -428,6 +428,7 @@ static inline void free_pmd_range(struct mmu_gather *tlb, pud_t *pud,
 	pmd = pmd_offset(pud, start);
 	pud_clear(pud);
 	pmd_free_tlb(tlb, pmd, start);
+	mm_dec_nr_pmds(tlb->mm);
 }
 
 static inline void free_pud_range(struct mmu_gather *tlb, pgd_t *pgd,
@@ -3322,15 +3323,17 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 
 	spin_lock(&mm->page_table_lock);
 #ifndef __ARCH_HAS_4LEVEL_HACK
-	if (pud_present(*pud))		/* Another has populated it */
-		pmd_free(mm, new);
-	else
+	if (!pud_present(*pud)) {
+		mm_inc_nr_pmds(mm);
 		pud_populate(mm, pud, new);
-#else
-	if (pgd_present(*pud))		/* Another has populated it */
+	} else	/* Another has populated it */
 		pmd_free(mm, new);
-	else
+#else
+	if (!pgd_present(*pud)) {
+		mm_inc_nr_pmds(mm);
 		pgd_populate(mm, pud, new);
+	} else	/* Another has populated it */
+		pmd_free(mm, new);
 #endif /* __ARCH_HAS_4LEVEL_HACK */
 	spin_unlock(&mm->page_table_lock);
 	return 0;

diff --git a/mm/mmap.c b/mm/mmap.c
index 14d84666e8ba..6a7d36d133fb 100644
--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2853,7 +2853,9 @@ void exit_mmap(struct mm_struct *mm)
 	vm_unacct_memory(nr_accounted);
 
 	WARN_ON(atomic_long_read(&mm->nr_ptes) >
-			(FIRST_USER_ADDRESS+PMD_SIZE-1)>>PMD_SHIFT);
+			round_up(FIRST_USER_ADDRESS, PMD_SIZE) >> PMD_SHIFT);
+	WARN_ON(mm_nr_pmds(mm) >
+			round_up(FIRST_USER_ADDRESS, PUD_SIZE) >> PUD_SHIFT);
 }
 
 /* Insert vm structure into process list sorted by address

diff --git a/mm/oom_kill.c b/mm/oom_kill.c
index b8df76ee2be3..642f38cb175a 100644
--- a/mm/oom_kill.c
+++ b/mm/oom_kill.c
@@ -169,8 +169,8 @@ unsigned long oom_badness(struct task_struct *p, struct mem_cgroup *memcg,
 	 * The baseline for the badness score is the proportion of RAM that each
 	 * task's rss, pagetable and swap space use.
 	 */
-	points = get_mm_rss(p->mm) + atomic_long_read(&p->mm->nr_ptes) +
-		 get_mm_counter(p->mm, MM_SWAPENTS);
+	points = get_mm_rss(p->mm) + get_mm_counter(p->mm, MM_SWAPENTS) +
+		 atomic_long_read(&p->mm->nr_ptes) + mm_nr_pmds(p->mm);
 	task_unlock(p);
 
 	/*
@@ -351,7 +351,7 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 	struct task_struct *p;
 	struct task_struct *task;
 
-	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes swapents oom_score_adj name\n");
+	pr_info("[ pid ]   uid  tgid total_vm      rss nr_ptes nr_pmds swapents oom_score_adj name\n");
 	rcu_read_lock();
 	for_each_process(p) {
 		if (oom_unkillable_task(p, memcg, nodemask))
@@ -367,10 +367,11 @@ static void dump_tasks(struct mem_cgroup *memcg, const nodemask_t *nodemask)
 			continue;
 		}
 
-		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %8lu         %5hd %s\n",
+		pr_info("[%5d] %5d %5d %8lu %8lu %7ld %7ld %8lu         %5hd %s\n",
 			task->pid, from_kuid(&init_user_ns, task_uid(task)),
 			task->tgid, task->mm->total_vm, get_mm_rss(task->mm),
 			atomic_long_read(&task->mm->nr_ptes),
+			mm_nr_pmds(task->mm),
 			get_mm_counter(task->mm, MM_SWAPENTS),
 			task->signal->oom_score_adj, task->comm);
 		task_unlock(task);
-- cgit 1.4.1
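The new counter is visible right next to VmPTE in /proc/pid/status.  A
minimal reader, assuming a kernel that carries this patch (the VmPMD:
line is absent when the PMD level is folded):

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char line[256];
		FILE *f = fopen("/proc/self/status", "r");

		if (!f)
			return 1;
		/* print only the page-table footprint lines task_mem() emits */
		while (fgets(line, sizeof(line), f))
			if (!strncmp(line, "VmPTE:", 6) ||
			    !strncmp(line, "VmPMD:", 6))
				fputs(line, stdout);
		fclose(f);
		return 0;
	}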
From 8d38633c3b4093aca7524945f1e9249d7d3a44da Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Wed, 11 Feb 2015 15:26:55 -0800
Subject: page_writeback: put account_page_redirty() after set_page_dirty()

The helper account_page_redirty() fixes the dirty-pages counters for
redirtied pages.  This patch calls it after the page is actually dirtied
and thereby prevents temporary underflows of the dirtied-pages counters
on the zone/bdi and in current->nr_dirtied.

Signed-off-by: Konstantin Khlebnikov
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/btrfs/extent_io.c | 2 +-
 mm/page-writeback.c  | 5 ++++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 790dbae3343c..c73df6a7c9b6 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -1407,8 +1407,8 @@ int extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end)
 	while (index <= end_index) {
 		page = find_get_page(inode->i_mapping, index);
 		BUG_ON(!page); /* Pages should be in the extent_io_tree */
-		account_page_redirty(page);
 		__set_page_dirty_nobuffers(page);
+		account_page_redirty(page);
 		page_cache_release(page);
 		index++;
 	}

diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index fb71e9deca85..6a73e47e81c6 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2168,9 +2168,12 @@ EXPORT_SYMBOL(account_page_redirty);
  */
 int redirty_page_for_writepage(struct writeback_control *wbc, struct page *page)
 {
+	int ret;
+
 	wbc->pages_skipped++;
+	ret = __set_page_dirty_nobuffers(page);
 	account_page_redirty(page);
-	return __set_page_dirty_nobuffers(page);
+	return ret;
 }
 EXPORT_SYMBOL(redirty_page_for_writepage);
-- cgit 1.4.1
From 05fbf357d94152171bc50f8a369390f1f16efd89 Mon Sep 17 00:00:00 2001
From: Konstantin Khlebnikov
Date: Wed, 11 Feb 2015 15:27:31 -0800
Subject: proc/pagemap: walk page tables under pte lock

Lockless access to the pte in pagemap_pte_range() might race with page
migration and trigger BUG_ON(!PageLocked()) in migration_entry_to_page():

    CPU A (pagemap)                 CPU B (migration)

                                    lock_page()
                                    try_to_unmap(page, TTU_MIGRATION...)
                                    make_migration_entry()
                                    set_pte_at()
    pte_to_pagemap_entry()
                                    remove_migration_ptes()
                                    unlock_page()
    if (is_migration_entry())
            migration_entry_to_page()
            BUG_ON(!PageLocked(page))

Also, the lockless read might be non-atomic if the pte is larger than the
word size.  Other pte walkers (smaps, numa_maps, clear_refs) already lock
ptes.

Fixes: 052fb0d635df ("proc: report file/anon bit in /proc/pid/pagemap")
Signed-off-by: Konstantin Khlebnikov
Reported-by: Andrey Ryabinin
Reviewed-by: Cyrill Gorcunov
Acked-by: Naoya Horiguchi
Acked-by: Kirill A. Shutemov
Cc: stable [3.5+]
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index e6e0abeb5d12..eeab30fcffcc 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1056,7 +1056,7 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 	struct vm_area_struct *vma;
 	struct pagemapread *pm = walk->private;
 	spinlock_t *ptl;
-	pte_t *pte;
+	pte_t *pte, *orig_pte;
 	int err = 0;
 
 	/* find the first VMA at or above 'addr' */
@@ -1117,15 +1117,19 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 		BUG_ON(is_vm_hugetlb_page(vma));
 
 		/* Addresses in the VMA. */
-		for (; addr < min(end, vma->vm_end); addr += PAGE_SIZE) {
+		orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl);
+		for (; addr < min(end, vma->vm_end); pte++, addr += PAGE_SIZE) {
 			pagemap_entry_t pme;
-			pte = pte_offset_map(pmd, addr);
+
 			pte_to_pagemap_entry(&pme, pm, vma, addr, *pte);
-			pte_unmap(pte);
 			err = add_to_pagemap(addr, &pme, pm);
 			if (err)
-				return err;
+				break;
 		}
+		pte_unmap_unlock(orig_pte, ptl);
+
+		if (err)
+			return err;
 
 		if (addr == end)
 			break;
-- cgit 1.4.1
From 14eb6fdd4204d215a14ecd9f84a1ca66faabcc4d Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 11 Feb 2015 15:27:43 -0800
Subject: smaps: remove mem_size_stats->vma and use walk_page_vma()

pagewalk.c can handle vma by itself, so we don't have to pass vma via
walk->private.  And show_smap() walks pages on a vma basis, so using
walk_page_vma() is preferable.

Signed-off-by: Naoya Horiguchi
Acked-by: Kirill A. Shutemov
Cc: "Kirill A. Shutemov"
Cc: Andrea Arcangeli
Cc: Cyrill Gorcunov
Cc: Dave Hansen
Cc: Pavel Emelyanov
Cc: Benjamin Herrenschmidt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index eeab30fcffcc..73425398b38c 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -436,7 +436,6 @@ const struct file_operations proc_tid_maps_operations = {
 #ifdef CONFIG_PROC_PAGE_MONITOR
 struct mem_size_stats {
-	struct vm_area_struct *vma;
 	unsigned long resident;
 	unsigned long shared_clean;
 	unsigned long shared_dirty;
@@ -485,7 +484,7 @@ static void smaps_pte_entry(pte_t *pte, unsigned long addr,
 		struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
-	struct vm_area_struct *vma = mss->vma;
+	struct vm_area_struct *vma = walk->vma;
 	struct page *page = NULL;
 
 	if (pte_present(*pte)) {
@@ -509,7 +508,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 		struct mm_walk *walk)
 {
 	struct mem_size_stats *mss = walk->private;
-	struct vm_area_struct *vma = mss->vma;
+	struct vm_area_struct *vma = walk->vma;
 	struct page *page;
 
 	/* FOLL_DUMP will return -EFAULT on huge zero page */
@@ -530,8 +529,7 @@ static void smaps_pmd_entry(pmd_t *pmd, unsigned long addr,
 static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end,
 			   struct mm_walk *walk)
 {
-	struct mem_size_stats *mss = walk->private;
-	struct vm_area_struct *vma = mss->vma;
+	struct vm_area_struct *vma = walk->vma;
 	pte_t *pte;
 	spinlock_t *ptl;
 
@@ -623,10 +621,8 @@ static int show_smap(struct seq_file *m, void *v, int is_pid)
 	};
 
 	memset(&mss, 0, sizeof mss);
-	mss.vma = vma;
 	/* mmap_sem is held in m_start */
-	if (vma->vm_mm && !is_vm_hugetlb_page(vma))
-		walk_page_range(vma->vm_start, vma->vm_end, &smaps_walk);
+	walk_page_vma(vma, &smaps_walk);
 
 	show_map_vma(m, vma, is_pid);
-- cgit 1.4.1
Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 46 ++++++++++++++++++++++------------------------ 1 file changed, 22 insertions(+), 24 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 73425398b38c..bed0834715a5 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -736,7 +736,6 @@ enum clear_refs_types { }; struct clear_refs_private { - struct vm_area_struct *vma; enum clear_refs_types type; }; @@ -767,7 +766,7 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { struct clear_refs_private *cp = walk->private; - struct vm_area_struct *vma = cp->vma; + struct vm_area_struct *vma = walk->vma; pte_t *pte, ptent; spinlock_t *ptl; struct page *page; @@ -801,6 +800,25 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, return 0; } +static int clear_refs_test_walk(unsigned long start, unsigned long end, + struct mm_walk *walk) +{ + struct clear_refs_private *cp = walk->private; + struct vm_area_struct *vma = walk->vma; + + /* + * Writing 1 to /proc/pid/clear_refs affects all pages. + * Writing 2 to /proc/pid/clear_refs only affects anonymous pages. + * Writing 3 to /proc/pid/clear_refs only affects file mapped pages. + * Writing 4 to /proc/pid/clear_refs affects all pages. + */ + if (cp->type == CLEAR_REFS_ANON && vma->vm_file) + return 1; + if (cp->type == CLEAR_REFS_MAPPED && !vma->vm_file) + return 1; + return 0; +} + static ssize_t clear_refs_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) { @@ -841,6 +859,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, }; struct mm_walk clear_refs_walk = { .pmd_entry = clear_refs_pte_range, + .test_walk = clear_refs_test_walk, .mm = mm, .private = &cp, }; @@ -860,28 +879,7 @@ static ssize_t clear_refs_write(struct file *file, const char __user *buf, } mmu_notifier_invalidate_range_start(mm, 0, -1); } - for (vma = mm->mmap; vma; vma = vma->vm_next) { - cp.vma = vma; - if (is_vm_hugetlb_page(vma)) - continue; - /* - * Writing 1 to /proc/pid/clear_refs affects all pages. - * - * Writing 2 to /proc/pid/clear_refs only affects - * Anonymous pages. - * - * Writing 3 to /proc/pid/clear_refs only affects file - * mapped pages. - * - * Writing 4 to /proc/pid/clear_refs affects all pages. - */ - if (type == CLEAR_REFS_ANON && vma->vm_file) - continue; - if (type == CLEAR_REFS_MAPPED && !vma->vm_file) - continue; - walk_page_range(vma->vm_start, vma->vm_end, - &clear_refs_walk); - } + walk_page_range(0, ~0UL, &clear_refs_walk); if (type == CLEAR_REFS_SOFT_DIRTY) mmu_notifier_invalidate_range_end(mm, 0, -1); flush_tlb_mm(mm); -- cgit 1.4.1 From f995ece24dfecb3614468befbe4e6e777b854cc0 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:27:48 -0800 Subject: pagemap: use walk->vma instead of calling find_vma() Page table walker has the information of the current vma in mm_walk, so we don't have to call find_vma() in each pagemap_(pte|hugetlb)_range() call any longer. Currently pagemap_pte_range() does vma loop itself, so this patch reduces many lines of code. NULL-vma check is omitted because we assume that we never run these callbacks on any address outside vma. And even if it were broken, NULL pointer dereference would be detected, so we can get enough information for debugging. Signed-off-by: Naoya Horiguchi Cc: "Kirill A. 
Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Kirill A. Shutemov Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 68 +++++++++++------------------------------------------- 1 file changed, 14 insertions(+), 54 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index bed0834715a5..4206706dd92a 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1047,15 +1047,13 @@ static inline void thp_pmd_to_pagemap_entry(pagemap_entry_t *pme, struct pagemap static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct vm_area_struct *vma; + struct vm_area_struct *vma = walk->vma; struct pagemapread *pm = walk->private; spinlock_t *ptl; pte_t *pte, *orig_pte; int err = 0; - /* find the first VMA at or above 'addr' */ - vma = find_vma(walk->mm, addr); - if (vma && pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { int pmd_flags2; if ((vma->vm_flags & VM_SOFTDIRTY) || pmd_soft_dirty(*pmd)) @@ -1081,55 +1079,20 @@ static int pagemap_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, if (pmd_trans_unstable(pmd)) return 0; - while (1) { - /* End of address space hole, which we mark as non-present. */ - unsigned long hole_end; - - if (vma) - hole_end = min(end, vma->vm_start); - else - hole_end = end; - - for (; addr < hole_end; addr += PAGE_SIZE) { - pagemap_entry_t pme = make_pme(PM_NOT_PRESENT(pm->v2)); - - err = add_to_pagemap(addr, &pme, pm); - if (err) - return err; - } - - if (!vma || vma->vm_start >= end) - break; - /* - * We can't possibly be in a hugetlb VMA. In general, - * for a mm_walk with a pmd_entry and a hugetlb_entry, - * the pmd_entry can only be called on addresses in a - * hugetlb if the walk starts in a non-hugetlb VMA and - * spans a hugepage VMA. Since pagemap_read walks are - * PMD-sized and PMD-aligned, this will never be true. - */ - BUG_ON(is_vm_hugetlb_page(vma)); - - /* Addresses in the VMA. */ - orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); - for (; addr < min(end, vma->vm_end); pte++, addr += PAGE_SIZE) { - pagemap_entry_t pme; - - pte_to_pagemap_entry(&pme, pm, vma, addr, *pte); - err = add_to_pagemap(addr, &pme, pm); - if (err) - break; - } - pte_unmap_unlock(orig_pte, ptl); + /* + * We can assume that @vma always points to a valid one and @end never + * goes beyond vma->vm_end. 
From 632fd60fe46f9159f059ed7612eb529e475302a9 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 11 Feb 2015 15:27:51 -0800
Subject: numa_maps: fix typo in gather_hugetbl_stats

Just doing s/gather_hugetbl_stats/gather_hugetlb_stats/g, this makes
code grep-friendly.

Signed-off-by: Naoya Horiguchi
Acked-by: Kirill A. Shutemov
Cc: "Kirill A. Shutemov"
Cc: Andrea Arcangeli
Cc: Cyrill Gorcunov
Cc: Dave Hansen
Cc: Pavel Emelyanov
Cc: Benjamin Herrenschmidt
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index 4206706dd92a..ae4bc2960077 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -1385,7 +1385,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr,
 	return 0;
 }
 #ifdef CONFIG_HUGETLB_PAGE
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
 		unsigned long addr, unsigned long end, struct mm_walk *walk)
 {
 	struct numa_maps *md;
@@ -1404,7 +1404,7 @@ static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
 }
 #else
-static int gather_hugetbl_stats(pte_t *pte, unsigned long hmask,
+static int gather_hugetlb_stats(pte_t *pte, unsigned long hmask,
 		unsigned long addr, unsigned long end, struct mm_walk *walk)
 {
 	return 0;
@@ -1435,7 +1435,7 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid)
 
 	md->vma = vma;
 
-	walk.hugetlb_entry = gather_hugetbl_stats;
+	walk.hugetlb_entry = gather_hugetlb_stats;
 	walk.pmd_entry = gather_pte_stats;
 	walk.private = md;
 	walk.mm = mm;
-- cgit 1.4.1
Shutemov" Cc: Andrea Arcangeli Cc: Cyrill Gorcunov Cc: Dave Hansen Cc: Pavel Emelyanov Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index ae4bc2960077..a36db4ad140b 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1283,7 +1283,6 @@ const struct file_operations proc_pagemap_operations = { #ifdef CONFIG_NUMA struct numa_maps { - struct vm_area_struct *vma; unsigned long pages; unsigned long anon; unsigned long active; @@ -1352,18 +1351,17 @@ static struct page *can_gather_numa_stats(pte_t pte, struct vm_area_struct *vma, static int gather_pte_stats(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { - struct numa_maps *md; + struct numa_maps *md = walk->private; + struct vm_area_struct *vma = walk->vma; spinlock_t *ptl; pte_t *orig_pte; pte_t *pte; - md = walk->private; - - if (pmd_trans_huge_lock(pmd, md->vma, &ptl) == 1) { + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; - page = can_gather_numa_stats(huge_pte, md->vma, addr); + page = can_gather_numa_stats(huge_pte, vma, addr); if (page) gather_stats(page, md, pte_dirty(huge_pte), HPAGE_PMD_SIZE/PAGE_SIZE); @@ -1375,7 +1373,7 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, return 0; orig_pte = pte = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); do { - struct page *page = can_gather_numa_stats(*pte, md->vma, addr); + struct page *page = can_gather_numa_stats(*pte, vma, addr); if (!page) continue; gather_stats(page, md, pte_dirty(*pte), 1); @@ -1422,7 +1420,12 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) struct numa_maps *md = &numa_priv->md; struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; - struct mm_walk walk = {}; + struct mm_walk walk = { + .hugetlb_entry = gather_hugetlb_stats, + .pmd_entry = gather_pte_stats, + .private = md, + .mm = mm, + }; struct mempolicy *pol; char buffer[64]; int nid; @@ -1433,13 +1436,6 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) /* Ensure we start with an empty set of numa_maps statistics. */ memset(md, 0, sizeof(*md)); - md->vma = vma; - - walk.hugetlb_entry = gather_hugetlb_stats; - walk.pmd_entry = gather_pte_stats; - walk.private = md; - walk.mm = mm; - pol = __get_vma_policy(vma, vma->vm_start); if (pol) { mpol_to_str(buffer, sizeof(buffer), pol); @@ -1473,7 +1469,8 @@ static int show_numa_map(struct seq_file *m, void *v, int is_pid) if (is_vm_hugetlb_page(vma)) seq_puts(m, " huge"); - walk_page_range(vma->vm_start, vma->vm_end, &walk); + /* mmap_sem is held by m_start */ + walk_page_vma(vma, &walk); if (!md->pages) goto out; -- cgit 1.4.1 From 48684a65b4e3ff544d62532c1b78962c9677b632 Mon Sep 17 00:00:00 2001 From: Naoya Horiguchi Date: Wed, 11 Feb 2015 15:28:06 -0800 Subject: mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP) walk_page_range() silently skips vma having VM_PFNMAP set, which leads to undesirable behaviour at client end (who called walk_page_range). For example for pagemap_read(), when no callbacks are called against VM_PFNMAP vma, pagemap_read() may prepare pagemap data for next virtual address range at wrong index. That could confuse and/or break userspace applications. 
From 48684a65b4e3ff544d62532c1b78962c9677b632 Mon Sep 17 00:00:00 2001
From: Naoya Horiguchi
Date: Wed, 11 Feb 2015 15:28:06 -0800
Subject: mm: pagewalk: fix misbehavior of walk_page_range for vma(VM_PFNMAP)

walk_page_range() silently skips any vma that has VM_PFNMAP set, which
leads to undesirable behaviour at the client end (the caller of
walk_page_range()).  For example, for pagemap_read(), when no callbacks
are called against a VM_PFNMAP vma, pagemap_read() may prepare pagemap
data for the next virtual address range at the wrong index.  That could
confuse and/or break userspace applications.

This patch avoids this misbehavior caused by vma(VM_PFNMAP) as follows:

- for pagemap_read(), which has its own ->pte_hole(), call the
  ->pte_hole() over vma(VM_PFNMAP),
- for clear_refs and queue_pages, which have their own ->test_walk,
  just return 1 and skip vma(VM_PFNMAP).  This is no problem because
  these are not interested in hole regions,
- for other callers, just skip the vma(VM_PFNMAP) as a default
  behavior.

Signed-off-by: Naoya Horiguchi
Signed-off-by: Shiraz Hashim
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 fs/proc/task_mmu.c |  3 +++
 mm/mempolicy.c     |  3 +++
 mm/pagewalk.c      | 21 +++++++++++++--------
 3 files changed, 19 insertions(+), 8 deletions(-)

diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index a36db4ad140b..f5ca96524f5f 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -806,6 +806,9 @@ static int clear_refs_test_walk(unsigned long start, unsigned long end,
 	struct clear_refs_private *cp = walk->private;
 	struct vm_area_struct *vma = walk->vma;
 
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
 	/*
 	 * Writing 1 to /proc/pid/clear_refs affects all pages.
 	 * Writing 2 to /proc/pid/clear_refs only affects anonymous pages.

diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index b1dcd11d867a..f1bd23803576 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -591,6 +591,9 @@ static int queue_pages_test_walk(unsigned long start, unsigned long end,
 	unsigned long endvma = vma->vm_end;
 	unsigned long flags = qp->flags;
 
+	if (vma->vm_flags & VM_PFNMAP)
+		return 1;
+
 	if (endvma > end)
 		endvma = end;
 	if (vma->vm_start > start)

diff --git a/mm/pagewalk.c b/mm/pagewalk.c
index 4c9a653ba563..75c1f2878519 100644
--- a/mm/pagewalk.c
+++ b/mm/pagewalk.c
@@ -35,7 +35,7 @@ static int walk_pmd_range(pud_t *pud, unsigned long addr, unsigned long end,
 	do {
 again:
 		next = pmd_addr_end(addr, end);
-		if (pmd_none(*pmd)) {
+		if (pmd_none(*pmd) || !walk->vma) {
 			if (walk->pte_hole)
 				err = walk->pte_hole(addr, next, walk);
 			if (err)
@@ -165,9 +165,6 @@ static int walk_hugetlb_range(unsigned long addr, unsigned long end,
 * or skip it via the returned value. Return 0 if we do walk over the
 * current vma, and return 1 if we skip the vma. Negative values means
 * error, where we abort the current walk.
- *
- * Default check (only VM_PFNMAP check for now) is used when the caller
- * doesn't define test_walk() callback.
 */
static int walk_page_test(unsigned long start, unsigned long end,
			struct mm_walk *walk)
@@ -178,11 +175,19 @@ static int walk_page_test(unsigned long start, unsigned long end,
 		return walk->test_walk(start, end, walk);
 
 	/*
-	 * Do not walk over vma(VM_PFNMAP), because we have no valid struct
-	 * page backing a VM_PFNMAP range. See also commit a9ff785e4437.
+	 * vma(VM_PFNMAP) doesn't have any valid struct pages behind VM_PFNMAP
+	 * range, so we don't walk over it as we do for normal vmas. However,
+	 * some callers are interested in handling hole range and they don't
+	 * want to just ignore any single address range. Such users certainly
+	 * define their ->pte_hole() callbacks, so let's delegate them to handle
+	 * vma(VM_PFNMAP).
 	 */
-	if (vma->vm_flags & VM_PFNMAP)
-		return 1;
+	if (vma->vm_flags & VM_PFNMAP) {
+		int err = 1;
+		if (walk->pte_hole)
+			err = walk->pte_hole(start, end, walk);
+		return err ? err : 1;
+	}
 
 	return 0;
 }
-- cgit 1.4.1
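To see how the callback split plays out for an in-kernel user of this
API, consider a hypothetical walker wired up against the struct mm_walk
of this era: pte_entry sees mapped pages, while pte_hole now also covers
VM_PFNMAP ranges when no test_walk is supplied.  This is a sketch only;
every name apart from the mm_walk members and walk_page_range() is
invented for illustration:

	/* all names except the mm_walk members are hypothetical */
	struct pfn_stats {
		unsigned long present;
		unsigned long holes;
	};

	static int stats_pte(pte_t *pte, unsigned long addr,
			     unsigned long next, struct mm_walk *walk)
	{
		struct pfn_stats *st = walk->private;

		if (pte_present(*pte))
			st->present++;
		return 0;
	}

	static int stats_hole(unsigned long addr, unsigned long next,
			      struct mm_walk *walk)
	{
		struct pfn_stats *st = walk->private;

		/* holes and, after this patch, VM_PFNMAP ranges land here */
		st->holes += (next - addr) >> PAGE_SHIFT;
		return 0;
	}

	static void stats_range(struct mm_struct *mm,
				unsigned long start, unsigned long end)
	{
		struct pfn_stats st = { 0, 0 };
		struct mm_walk walk = {
			.pte_entry = stats_pte,
			.pte_hole = stats_hole,
			.mm = mm,
			.private = &st,
		};

		down_read(&mm->mmap_sem);
		walk_page_range(start, end, &walk);
		up_read(&mm->mmap_sem);
	}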
Shutemov" Date: Wed, 11 Feb 2015 15:28:08 -0800 Subject: mm: /proc/pid/clear_refs: avoid split_huge_page() Currently pagewalker splits all THP pages on any clear_refs request. It's not necessary. We can handle this on PMD level. One side effect is that soft dirty will potentially see more dirty memory, since we will mark whole THP page dirty at once. Sanity checked with CRIU test suite. More testing is required. Signed-off-by: Kirill A. Shutemov Signed-off-by: Naoya Horiguchi Reviewed-by: Cyrill Gorcunov Cc: Pavel Emelyanov Cc: Andrea Arcangeli Cc: Dave Hansen Cc: "Kirill A. Shutemov" Cc: Benjamin Herrenschmidt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 47 ++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 44 insertions(+), 3 deletions(-) (limited to 'fs') diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f5ca96524f5f..0e36c1e49fe3 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -739,10 +739,10 @@ struct clear_refs_private { enum clear_refs_types type; }; +#ifdef CONFIG_MEM_SOFT_DIRTY static inline void clear_soft_dirty(struct vm_area_struct *vma, unsigned long addr, pte_t *pte) { -#ifdef CONFIG_MEM_SOFT_DIRTY /* * The soft-dirty tracker uses #PF-s to catch writes * to pages, so write-protect the pte as well. See the @@ -759,9 +759,35 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, } set_pte_at(vma->vm_mm, addr, pte, ptent); -#endif } +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ + pmd_t pmd = *pmdp; + + pmd = pmd_wrprotect(pmd); + pmd = pmd_clear_flags(pmd, _PAGE_SOFT_DIRTY); + + if (vma->vm_flags & VM_SOFTDIRTY) + vma->vm_flags &= ~VM_SOFTDIRTY; + + set_pmd_at(vma->vm_mm, addr, pmdp, pmd); +} + +#else + +static inline void clear_soft_dirty(struct vm_area_struct *vma, + unsigned long addr, pte_t *pte) +{ +} + +static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, + unsigned long addr, pmd_t *pmdp) +{ +} +#endif + static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, struct mm_walk *walk) { @@ -771,7 +797,22 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - split_huge_page_pmd(vma, addr, pmd); + if (pmd_trans_huge_lock(pmd, vma, &ptl) == 1) { + if (cp->type == CLEAR_REFS_SOFT_DIRTY) { + clear_soft_dirty_pmd(vma, addr, pmd); + goto out; + } + + page = pmd_page(*pmd); + + /* Clear accessed and referenced bits. */ + pmdp_test_and_clear_young(vma, addr, pmd); + ClearPageReferenced(page); +out: + spin_unlock(ptl); + return 0; + } + if (pmd_trans_unstable(pmd)) return 0; -- cgit 1.4.1