summary refs log tree commit diff
path: root/mm/huge_memory.c
diff options
context:
space:
mode:
authorDavid Rientjes <rientjes@google.com>2012-10-08 16:34:03 -0700
committerLinus Torvalds <torvalds@linux-foundation.org>2012-10-09 16:23:02 +0900
commitb676b293fb48672904ee1b9828cb50b4eed01717 (patch)
tree22b2dcc1623da40a5ddfaf6db2bc5ab1c2476ddb /mm/huge_memory.c
parente90bdb7f52f94204c78fb40b0804645defdebd71 (diff)
downloadlinux-b676b293fb48672904ee1b9828cb50b4eed01717.tar.gz
mm, thp: fix mapped pages avoiding unevictable list on mlock
When a transparent hugepage is mapped and it is included in an mlock()
range, follow_page() incorrectly avoids setting the page's mlock bit and
moving it to the unevictable lru.

This is evident if you try to mlock(), munlock(), and then mlock() a
range again.  Currently:

	#define MAP_SIZE	(4 << 30)	/* 4GB */

	void *ptr = mmap(NULL, MAP_SIZE, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, 0, 0);
	mlock(ptr, MAP_SIZE);

		$ grep -E "Unevictable|Inactive\(anon" /proc/meminfo
		Inactive(anon):     6304 kB
		Unevictable:     4213924 kB

	munlock(ptr, MAP_SIZE);

		Inactive(anon):  4186252 kB
		Unevictable:       19652 kB

	mlock(ptr, MAP_SIZE);

		Inactive(anon):  4198556 kB
		Unevictable:       21684 kB

Notice that less than 2MB was added to the unevictable list; this is
because these pages in the range are not transparent hugepages since the
4GB range was allocated with mmap() and has no specific alignment.  If
posix_memalign() were used instead, unevictable would not have grown at
all on the second mlock().

The fix is to call mlock_vma_page() so that the mlock bit is set and the
page is added to the unevictable list.  With this patch:

	mlock(ptr, MAP_SIZE);

		Inactive(anon):     4056 kB
		Unevictable:     4213940 kB

	munlock(ptr, MAP_SIZE);

		Inactive(anon):  4198268 kB
		Unevictable:       19636 kB

	mlock(ptr, MAP_SIZE);

		Inactive(anon):     4008 kB
		Unevictable:     4213940 kB

Signed-off-by: David Rientjes <rientjes@google.com>
Acked-by: Hugh Dickins <hughd@google.com>
Reviewed-by: Andrea Arcangeli <aarcange@redhat.com>
Cc: Naoya Horiguchi <n-horiguchi@ah.jp.nec.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Michel Lespinasse <walken@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
Diffstat (limited to 'mm/huge_memory.c')
-rw-r--r--mm/huge_memory.c11
1 files changed, 10 insertions, 1 deletions
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 08a943b9cf95..3a8d6b7d95db 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -971,11 +971,12 @@ out_unlock:
 	return ret;
 }
 
-struct page *follow_trans_huge_pmd(struct mm_struct *mm,
+struct page *follow_trans_huge_pmd(struct vm_area_struct *vma,
 				   unsigned long addr,
 				   pmd_t *pmd,
 				   unsigned int flags)
 {
+	struct mm_struct *mm = vma->vm_mm;
 	struct page *page = NULL;
 
 	assert_spin_locked(&mm->page_table_lock);
@@ -998,6 +999,14 @@ struct page *follow_trans_huge_pmd(struct mm_struct *mm,
 		_pmd = pmd_mkyoung(pmd_mkdirty(*pmd));
 		set_pmd_at(mm, addr & HPAGE_PMD_MASK, pmd, _pmd);
 	}
+	if ((flags & FOLL_MLOCK) && (vma->vm_flags & VM_LOCKED)) {
+		if (page->mapping && trylock_page(page)) {
+			lru_add_drain();
+			if (page->mapping)
+				mlock_vma_page(page);
+			unlock_page(page);
+		}
+	}
 	page += (addr & ~HPAGE_PMD_MASK) >> PAGE_SHIFT;
 	VM_BUG_ON(!PageCompound(page));
 	if (flags & FOLL_GET)