diff options
Diffstat (limited to 'mm')
-rw-r--r-- | mm/compaction.c | 2 | ||||
-rw-r--r-- | mm/debug_vm_pgtable.c | 4 | ||||
-rw-r--r-- | mm/gup.c | 4 | ||||
-rw-r--r-- | mm/huge_memory.c | 56 | ||||
-rw-r--r-- | mm/hugetlb.c | 157 | ||||
-rw-r--r-- | mm/internal.h | 73 | ||||
-rw-r--r-- | mm/ioremap.c | 6 | ||||
-rw-r--r-- | mm/kasan/common.c | 4 | ||||
-rw-r--r-- | mm/kasan/hw_tags.c | 32 | ||||
-rw-r--r-- | mm/kasan/init.c | 4 | ||||
-rw-r--r-- | mm/kasan/sw_tags.c | 7 | ||||
-rw-r--r-- | mm/kfence/core.c | 6 | ||||
-rw-r--r-- | mm/ksm.c | 3 | ||||
-rw-r--r-- | mm/memfd.c | 4 | ||||
-rw-r--r-- | mm/memory-failure.c | 119 | ||||
-rw-r--r-- | mm/memory.c | 45 | ||||
-rw-r--r-- | mm/mempool.c | 6 | ||||
-rw-r--r-- | mm/migrate.c | 1 | ||||
-rw-r--r-- | mm/mlock.c | 22 | ||||
-rw-r--r-- | mm/mmap.c | 4 | ||||
-rw-r--r-- | mm/page_alloc.c | 74 | ||||
-rw-r--r-- | mm/page_vma_mapped.c | 160 | ||||
-rw-r--r-- | mm/pgtable-generic.c | 5 | ||||
-rw-r--r-- | mm/rmap.c | 39 | ||||
-rw-r--r-- | mm/shmem.c | 44 | ||||
-rw-r--r-- | mm/shuffle.h | 4 | ||||
-rw-r--r-- | mm/slab_common.c | 13 | ||||
-rw-r--r-- | mm/slub.c | 47 | ||||
-rw-r--r-- | mm/sparse.c | 13 | ||||
-rw-r--r-- | mm/swapfile.c | 2 | ||||
-rw-r--r-- | mm/truncate.c | 43 | ||||
-rw-r--r-- | mm/userfaultfd.c | 28 | ||||
-rw-r--r-- | mm/vmalloc.c | 41 |
33 files changed, 705 insertions, 367 deletions
diff --git a/mm/compaction.c b/mm/compaction.c index 84fde270ae74..725f564a5664 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1955,7 +1955,7 @@ static inline bool is_via_compact_memory(int order) static bool kswapd_is_running(pg_data_t *pgdat) { - return pgdat->kswapd && (pgdat->kswapd->state == TASK_RUNNING); + return pgdat->kswapd && task_is_running(pgdat->kswapd); } /* diff --git a/mm/debug_vm_pgtable.c b/mm/debug_vm_pgtable.c index 05efe98a9ac2..297d1b349c19 100644 --- a/mm/debug_vm_pgtable.c +++ b/mm/debug_vm_pgtable.c @@ -192,7 +192,7 @@ static void __init pmd_advanced_tests(struct mm_struct *mm, pr_debug("Validating PMD advanced\n"); /* Align the address wrt HPAGE_PMD_SIZE */ - vaddr = (vaddr & HPAGE_PMD_MASK) + HPAGE_PMD_SIZE; + vaddr &= HPAGE_PMD_MASK; pgtable_trans_huge_deposit(mm, pmdp, pgtable); @@ -330,7 +330,7 @@ static void __init pud_advanced_tests(struct mm_struct *mm, pr_debug("Validating PUD advanced\n"); /* Align the address wrt HPAGE_PUD_SIZE */ - vaddr = (vaddr & HPAGE_PUD_MASK) + HPAGE_PUD_SIZE; + vaddr &= HPAGE_PUD_MASK; set_pud_at(mm, vaddr, pudp, pud); pudp_set_wrprotect(mm, vaddr, pudp); diff --git a/mm/gup.c b/mm/gup.c index 0697134b6a12..3ded6a5f26b2 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1593,10 +1593,6 @@ struct page *get_dump_page(unsigned long addr) FOLL_FORCE | FOLL_DUMP | FOLL_GET); if (locked) mmap_read_unlock(mm); - - if (ret == 1 && is_page_poisoned(page)) - return NULL; - return (ret == 1) ? page : NULL; } #endif /* CONFIG_ELF_CORE */ diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 63ed6b25deaa..6d2a0119fc58 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -62,6 +62,7 @@ static struct shrinker deferred_split_shrinker; static atomic_t huge_zero_refcount; struct page *huge_zero_page __read_mostly; +unsigned long huge_zero_pfn __read_mostly = ~0UL; bool transparent_hugepage_enabled(struct vm_area_struct *vma) { @@ -98,6 +99,7 @@ retry: __free_pages(zero_page, compound_order(zero_page)); goto retry; } + WRITE_ONCE(huge_zero_pfn, page_to_pfn(zero_page)); /* We take additional reference here. It will be put back by shrinker */ atomic_set(&huge_zero_refcount, 2); @@ -147,6 +149,7 @@ static unsigned long shrink_huge_zero_page_scan(struct shrinker *shrink, if (atomic_cmpxchg(&huge_zero_refcount, 1, 0) == 1) { struct page *zero_page = xchg(&huge_zero_page, NULL); BUG_ON(zero_page == NULL); + WRITE_ONCE(huge_zero_pfn, ~0UL); __free_pages(zero_page, compound_order(zero_page)); return HPAGE_PMD_NR; } @@ -2044,7 +2047,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, count_vm_event(THP_SPLIT_PMD); if (!vma_is_anonymous(vma)) { - _pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); + old_pmd = pmdp_huge_clear_flush_notify(vma, haddr, pmd); /* * We are going to unmap this huge page. So * just go ahead and zap it @@ -2053,16 +2056,25 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, zap_deposited_table(mm, pmd); if (vma_is_special_huge(vma)) return; - page = pmd_page(_pmd); - if (!PageDirty(page) && pmd_dirty(_pmd)) - set_page_dirty(page); - if (!PageReferenced(page) && pmd_young(_pmd)) - SetPageReferenced(page); - page_remove_rmap(page, true); - put_page(page); + if (unlikely(is_pmd_migration_entry(old_pmd))) { + swp_entry_t entry; + + entry = pmd_to_swp_entry(old_pmd); + page = migration_entry_to_page(entry); + } else { + page = pmd_page(old_pmd); + if (!PageDirty(page) && pmd_dirty(old_pmd)) + set_page_dirty(page); + if (!PageReferenced(page) && pmd_young(old_pmd)) + SetPageReferenced(page); + page_remove_rmap(page, true); + put_page(page); + } add_mm_counter(mm, mm_counter_file(page), -HPAGE_PMD_NR); return; - } else if (pmd_trans_huge(*pmd) && is_huge_zero_pmd(*pmd)) { + } + + if (is_huge_zero_pmd(*pmd)) { /* * FIXME: Do we want to invalidate secondary mmu by calling * mmu_notifier_invalidate_range() see comments below inside @@ -2338,17 +2350,17 @@ void vma_adjust_trans_huge(struct vm_area_struct *vma, static void unmap_page(struct page *page) { - enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | + enum ttu_flags ttu_flags = TTU_IGNORE_MLOCK | TTU_SYNC | TTU_RMAP_LOCKED | TTU_SPLIT_HUGE_PMD; - bool unmap_success; VM_BUG_ON_PAGE(!PageHead(page), page); if (PageAnon(page)) ttu_flags |= TTU_SPLIT_FREEZE; - unmap_success = try_to_unmap(page, ttu_flags); - VM_BUG_ON_PAGE(!unmap_success, page); + try_to_unmap(page, ttu_flags); + + VM_WARN_ON_ONCE_PAGE(page_mapped(page), page); } static void remap_page(struct page *page, unsigned int nr) @@ -2659,7 +2671,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) struct deferred_split *ds_queue = get_deferred_split_queue(head); struct anon_vma *anon_vma = NULL; struct address_space *mapping = NULL; - int count, mapcount, extra_pins, ret; + int extra_pins, ret; pgoff_t end; VM_BUG_ON_PAGE(is_huge_zero_page(head), head); @@ -2718,7 +2730,6 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) } unmap_page(head); - VM_BUG_ON_PAGE(compound_mapcount(head), head); /* block interrupt reentry in xa_lock and spinlock */ local_irq_disable(); @@ -2736,9 +2747,7 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) /* Prevent deferred_split_scan() touching ->_refcount */ spin_lock(&ds_queue->split_queue_lock); - count = page_count(head); - mapcount = total_mapcount(head); - if (!mapcount && page_ref_freeze(head, 1 + extra_pins)) { + if (page_ref_freeze(head, 1 + extra_pins)) { if (!list_empty(page_deferred_list(head))) { ds_queue->split_queue_len--; list_del(page_deferred_list(head)); @@ -2758,16 +2767,9 @@ int split_huge_page_to_list(struct page *page, struct list_head *list) __split_huge_page(page, list, end); ret = 0; } else { - if (IS_ENABLED(CONFIG_DEBUG_VM) && mapcount) { - pr_alert("total_mapcount: %u, page_count(): %u\n", - mapcount, count); - if (PageTail(page)) - dump_page(head, NULL); - dump_page(page, "total_mapcount(head) > 0"); - BUG(); - } spin_unlock(&ds_queue->split_queue_lock); -fail: if (mapping) +fail: + if (mapping) xa_unlock(&mapping->i_pages); local_irq_enable(); remap_page(head, thp_nr_pages(head)); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 3db405dea3dc..5ba5a0da6d57 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1588,15 +1588,12 @@ struct address_space *hugetlb_page_mapping_lock_write(struct page *hpage) return NULL; } -pgoff_t __basepage_index(struct page *page) +pgoff_t hugetlb_basepage_index(struct page *page) { struct page *page_head = compound_head(page); pgoff_t index = page_index(page_head); unsigned long compound_idx; - if (!PageHuge(page_head)) - return page_index(page); - if (compound_order(page_head) >= MAX_ORDER) compound_idx = page_to_pfn(page) - page_to_pfn(page_head); else @@ -1793,7 +1790,7 @@ retry: SetPageHWPoison(page); ClearPageHWPoison(head); } - remove_hugetlb_page(h, page, false); + remove_hugetlb_page(h, head, false); h->max_huge_pages--; spin_unlock_irq(&hugetlb_lock); update_and_free_page(h, head); @@ -2121,12 +2118,18 @@ out: * be restored when a newly allocated huge page must be freed. It is * to be called after calling vma_needs_reservation to determine if a * reservation exists. + * + * vma_del_reservation is used in error paths where an entry in the reserve + * map was created during huge page allocation and must be removed. It is to + * be called after calling vma_needs_reservation to determine if a reservation + * exists. */ enum vma_resv_mode { VMA_NEEDS_RESV, VMA_COMMIT_RESV, VMA_END_RESV, VMA_ADD_RESV, + VMA_DEL_RESV, }; static long __vma_reservation_common(struct hstate *h, struct vm_area_struct *vma, unsigned long addr, @@ -2170,11 +2173,21 @@ static long __vma_reservation_common(struct hstate *h, ret = region_del(resv, idx, idx + 1); } break; + case VMA_DEL_RESV: + if (vma->vm_flags & VM_MAYSHARE) { + region_abort(resv, idx, idx + 1, 1); + ret = region_del(resv, idx, idx + 1); + } else { + ret = region_add(resv, idx, idx + 1, 1, NULL, NULL); + /* region_add calls of range 1 should never fail. */ + VM_BUG_ON(ret < 0); + } + break; default: BUG(); } - if (vma->vm_flags & VM_MAYSHARE) + if (vma->vm_flags & VM_MAYSHARE || mode == VMA_DEL_RESV) return ret; /* * We know private mapping must have HPAGE_RESV_OWNER set. @@ -2222,25 +2235,39 @@ static long vma_add_reservation(struct hstate *h, return __vma_reservation_common(h, vma, addr, VMA_ADD_RESV); } +static long vma_del_reservation(struct hstate *h, + struct vm_area_struct *vma, unsigned long addr) +{ + return __vma_reservation_common(h, vma, addr, VMA_DEL_RESV); +} + /* - * This routine is called to restore a reservation on error paths. In the - * specific error paths, a huge page was allocated (via alloc_huge_page) - * and is about to be freed. If a reservation for the page existed, - * alloc_huge_page would have consumed the reservation and set - * HPageRestoreReserve in the newly allocated page. When the page is freed - * via free_huge_page, the global reservation count will be incremented if - * HPageRestoreReserve is set. However, free_huge_page can not adjust the - * reserve map. Adjust the reserve map here to be consistent with global - * reserve count adjustments to be made by free_huge_page. + * This routine is called to restore reservation information on error paths. + * It should ONLY be called for pages allocated via alloc_huge_page(), and + * the hugetlb mutex should remain held when calling this routine. + * + * It handles two specific cases: + * 1) A reservation was in place and the page consumed the reservation. + * HPageRestoreReserve is set in the page. + * 2) No reservation was in place for the page, so HPageRestoreReserve is + * not set. However, alloc_huge_page always updates the reserve map. + * + * In case 1, free_huge_page later in the error path will increment the + * global reserve count. But, free_huge_page does not have enough context + * to adjust the reservation map. This case deals primarily with private + * mappings. Adjust the reserve map here to be consistent with global + * reserve count adjustments to be made by free_huge_page. Make sure the + * reserve map indicates there is a reservation present. + * + * In case 2, simply undo reserve map modifications done by alloc_huge_page. */ -static void restore_reserve_on_error(struct hstate *h, - struct vm_area_struct *vma, unsigned long address, - struct page *page) +void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma, + unsigned long address, struct page *page) { - if (unlikely(HPageRestoreReserve(page))) { - long rc = vma_needs_reservation(h, vma, address); + long rc = vma_needs_reservation(h, vma, address); - if (unlikely(rc < 0)) { + if (HPageRestoreReserve(page)) { + if (unlikely(rc < 0)) /* * Rare out of memory condition in reserve map * manipulation. Clear HPageRestoreReserve so that @@ -2253,16 +2280,57 @@ static void restore_reserve_on_error(struct hstate *h, * accounting of reserve counts. */ ClearHPageRestoreReserve(page); - } else if (rc) { - rc = vma_add_reservation(h, vma, address); - if (unlikely(rc < 0)) + else if (rc) + (void)vma_add_reservation(h, vma, address); + else + vma_end_reservation(h, vma, address); + } else { + if (!rc) { + /* + * This indicates there is an entry in the reserve map + * added by alloc_huge_page. We know it was added + * before the alloc_huge_page call, otherwise + * HPageRestoreReserve would be set on the page. + * Remove the entry so that a subsequent allocation + * does not consume a reservation. + */ + rc = vma_del_reservation(h, vma, address); + if (rc < 0) /* - * See above comment about rare out of - * memory condition. + * VERY rare out of memory condition. Since + * we can not delete the entry, set + * HPageRestoreReserve so that the reserve + * count will be incremented when the page + * is freed. This reserve will be consumed + * on a subsequent allocation. */ - ClearHPageRestoreReserve(page); + SetHPageRestoreReserve(page); + } else if (rc < 0) { + /* + * Rare out of memory condition from + * vma_needs_reservation call. Memory allocation is + * only attempted if a new entry is needed. Therefore, + * this implies there is not an entry in the + * reserve map. + * + * For shared mappings, no entry in the map indicates + * no reservation. We are done. + */ + if (!(vma->vm_flags & VM_MAYSHARE)) + /* + * For private mappings, no entry indicates + * a reservation is present. Since we can + * not add an entry, set SetHPageRestoreReserve + * on the page so reserve count will be + * incremented when freed. This reserve will + * be consumed on a subsequent allocation. + */ + SetHPageRestoreReserve(page); } else - vma_end_reservation(h, vma, address); + /* + * No reservation present, do nothing + */ + vma_end_reservation(h, vma, address); } } @@ -4037,6 +4105,8 @@ again: spin_lock_nested(src_ptl, SINGLE_DEPTH_NESTING); entry = huge_ptep_get(src_pte); if (!pte_same(src_pte_old, entry)) { + restore_reserve_on_error(h, vma, addr, + new); put_page(new); /* dst_entry won't change as in child */ goto again; @@ -4056,6 +4126,7 @@ again: * See Documentation/vm/mmu_notifier.rst */ huge_ptep_set_wrprotect(src, addr, src_pte); + entry = huge_pte_wrprotect(entry); } page_dup_rmap(ptepage, true); @@ -4888,10 +4959,20 @@ int hugetlb_mcopy_atomic_pte(struct mm_struct *dst_mm, if (!page) goto out; } else if (!*pagep) { - ret = -ENOMEM; + /* If a page already exists, then it's UFFDIO_COPY for + * a non-missing case. Return -EEXIST. + */ + if (vm_shared && + hugetlbfs_pagecache_present(h, dst_vma, dst_addr)) { + ret = -EEXIST; + goto out; + } + page = alloc_huge_page(dst_vma, dst_addr, 0); - if (IS_ERR(page)) + if (IS_ERR(page)) { + ret = -ENOMEM; goto out; + } ret = copy_huge_page_from_user(page, (const void __user *) src_addr, @@ -4995,6 +5076,7 @@ out_release_unlock: if (vm_shared || is_continue) unlock_page(page); out_release_nounlock: + restore_reserve_on_error(h, dst_vma, dst_addr, page); put_page(page); goto out; } @@ -5846,6 +5928,21 @@ unlock: return ret; } +int get_hwpoison_huge_page(struct page *page, bool *hugetlb) +{ + int ret = 0; + + *hugetlb = false; + spin_lock_irq(&hugetlb_lock); + if (PageHeadHuge(page)) { + *hugetlb = true; + if (HPageFreed(page) || HPageMigratable(page)) + ret = get_page_unless_zero(page); + } + spin_unlock_irq(&hugetlb_lock); + return ret; +} + void putback_active_hugepage(struct page *page) { spin_lock_irq(&hugetlb_lock); diff --git a/mm/internal.h b/mm/internal.h index 54bd0dc2c23c..e8fdb531f887 100644 --- a/mm/internal.h +++ b/mm/internal.h @@ -96,26 +96,6 @@ static inline void set_page_refcounted(struct page *page) set_page_count(page, 1); } -/* - * When kernel touch the user page, the user page may be have been marked - * poison but still mapped in user space, if without this page, the kernel - * can guarantee the data integrity and operation success, the kernel is - * better to check the posion status and avoid touching it, be good not to - * panic, coredump for process fatal signal is a sample case matching this - * scenario. Or if kernel can't guarantee the data integrity, it's better - * not to call this function, let kernel touch the poison page and get to - * panic. - */ -static inline bool is_page_poisoned(struct page *page) -{ - if (PageHWPoison(page)) - return true; - else if (PageHuge(page) && PageHWPoison(compound_head(page))) - return true; - - return false; -} - extern unsigned long highest_memmap_pfn; /* @@ -404,27 +384,52 @@ static inline void mlock_migrate_page(struct page *newpage, struct page *page) extern pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma); /* - * At what user virtual address is page expected in @vma? + * At what user virtual address is page expected in vma? + * Returns -EFAULT if all of the page is outside the range of vma. + * If page is a compound head, the entire compound page is considered. */ static inline unsigned long -__vma_address(struct page *page, struct vm_area_struct *vma) +vma_address(struct page *page, struct vm_area_struct *vma) { - pgoff_t pgoff = page_to_pgoff(page); - return vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page); + if (pgoff >= vma->vm_pgoff) { + address = vma->vm_start + + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address >= vma->vm_end) + address = -EFAULT; + } else if (PageHead(page) && + pgoff + compound_nr(page) - 1 >= vma->vm_pgoff) { + /* Test above avoids possibility of wrap to 0 on 32-bit */ + address = vma->vm_start; + } else { + address = -EFAULT; + } + return address; } +/* + * Then at what user virtual address will none of the page be found in vma? + * Assumes that vma_address() already returned a good starting address. + * If page is a compound head, the entire compound page is considered. + */ static inline unsigned long -vma_address(struct page *page, struct vm_area_struct *vma) +vma_address_end(struct page *page, struct vm_area_struct *vma) { - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + thp_size(page) - PAGE_SIZE; - - /* page should be within @vma mapping range */ - VM_BUG_ON_VMA(end < vma->vm_start || start >= vma->vm_end, vma); - - return max(start, vma->vm_start); + pgoff_t pgoff; + unsigned long address; + + VM_BUG_ON_PAGE(PageKsm(page), page); /* KSM page->index unusable */ + pgoff = page_to_pgoff(page) + compound_nr(page); + address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); + /* Check for address beyond vma (or wrapped through 0?) */ + if (address < vma->vm_start || address > vma->vm_end) + address = vma->vm_end; + return address; } static inline struct file *maybe_unlock_mmap_for_io(struct vm_fault *vmf, diff --git a/mm/ioremap.c b/mm/ioremap.c index d1dcc7e744ac..8ee0136f8cb0 100644 --- a/mm/ioremap.c +++ b/mm/ioremap.c @@ -16,16 +16,16 @@ #include "pgalloc-track.h" #ifdef CONFIG_HAVE_ARCH_HUGE_VMAP -static bool __ro_after_init iomap_max_page_shift = PAGE_SHIFT; +static unsigned int __ro_after_init iomap_max_page_shift = BITS_PER_LONG - 1; static int __init set_nohugeiomap(char *str) { - iomap_max_page_shift = P4D_SHIFT; + iomap_max_page_shift = PAGE_SHIFT; return 0; } early_param("nohugeiomap", set_nohugeiomap); #else /* CONFIG_HAVE_ARCH_HUGE_VMAP */ -static const bool iomap_max_page_shift = PAGE_SHIFT; +static const unsigned int iomap_max_page_shift = PAGE_SHIFT; #endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */ int ioremap_page_range(unsigned long addr, diff --git a/mm/kasan/common.c b/mm/kasan/common.c index 6bb87f2acd4e..0ecd293af344 100644 --- a/mm/kasan/common.c +++ b/mm/kasan/common.c @@ -97,7 +97,7 @@ slab_flags_t __kasan_never_merge(void) return 0; } -void __kasan_alloc_pages(struct page *page, unsigned int order, bool init) +void __kasan_unpoison_pages(struct page *page, unsigned int order, bool init) { u8 tag; unsigned long i; @@ -111,7 +111,7 @@ void __kasan_alloc_pages(struct page *page, unsigned int order, bool init) kasan_unpoison(page_address(page), PAGE_SIZE << order, init); } -void __kasan_free_pages(struct page *page, unsigned int order, bool init) +void __kasan_poison_pages(struct page *page, unsigned int order, bool init) { if (likely(!PageHighMem(page))) kasan_poison(page_address(page), PAGE_SIZE << order, diff --git a/mm/kasan/hw_tags.c b/mm/kasan/hw_tags.c index 4004388b4e4b..ed5e5b833d61 100644 --- a/mm/kasan/hw_tags.c +++ b/mm/kasan/hw_tags.c @@ -238,6 +238,38 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, return &alloc_meta->free_track[0]; } +void kasan_alloc_pages(struct page *page, unsigned int order, gfp_t flags) +{ + /* + * This condition should match the one in post_alloc_hook() in + * page_alloc.c. + */ + bool init = !want_init_on_free() && want_init_on_alloc(flags); + + if (flags & __GFP_SKIP_KASAN_POISON) + SetPageSkipKASanPoison(page); + + if (flags & __GFP_ZEROTAGS) { + int i; + + for (i = 0; i != 1 << order; ++i) + tag_clear_highpage(page + i); + } else { + kasan_unpoison_pages(page, order, init); + } +} + +void kasan_free_pages(struct page *page, unsigned int order) +{ + /* + * This condition should match the one in free_pages_prepare() in + * page_alloc.c. + */ + bool init = want_init_on_free(); + + kasan_poison_pages(page, order, init); +} + #if IS_ENABLED(CONFIG_KASAN_KUNIT_TEST) void kasan_set_tagging_report_once(bool state) diff --git a/mm/kasan/init.c b/mm/kasan/init.c index c4605ac9837b..348f31d15a97 100644 --- a/mm/kasan/init.c +++ b/mm/kasan/init.c @@ -220,8 +220,8 @@ static int __ref zero_p4d_populate(pgd_t *pgd, unsigned long addr, /** * kasan_populate_early_shadow - populate shadow memory region with * kasan_early_shadow_page - * @shadow_start - start of the memory range to populate - * @shadow_end - end of the memory range to populate + * @shadow_start: start of the memory range to populate + * @shadow_end: end of the memory range to populate */ int __ref kasan_populate_early_shadow(const void *shadow_start, const void *shadow_end) diff --git a/mm/kasan/sw_tags.c b/mm/kasan/sw_tags.c index 9df8e7f69e87..9362938abbfa 100644 --- a/mm/kasan/sw_tags.c +++ b/mm/kasan/sw_tags.c @@ -207,3 +207,10 @@ struct kasan_track *kasan_get_free_track(struct kmem_cache *cache, return &alloc_meta->free_track[i]; } + +void kasan_tag_mismatch(unsigned long addr, unsigned long access_info, + unsigned long ret_ip) +{ + kasan_report(addr, 1 << (access_info & 0xf), access_info & 0x10, + ret_ip); +} diff --git a/mm/kfence/core.c b/mm/kfence/core.c index e18fbbd5d9b4..4d21ac44d5d3 100644 --- a/mm/kfence/core.c +++ b/mm/kfence/core.c @@ -627,10 +627,10 @@ static void toggle_allocation_gate(struct work_struct *work) * During low activity with no allocations we might wait a * while; let's avoid the hung task warning. */ - wait_event_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), - sysctl_hung_task_timeout_secs * HZ / 2); + wait_event_idle_timeout(allocation_wait, atomic_read(&kfence_allocation_gate), + sysctl_hung_task_timeout_secs * HZ / 2); } else { - wait_event(allocation_wait, atomic_read(&kfence_allocation_gate)); + wait_event_idle(allocation_wait, atomic_read(&kfence_allocation_gate)); } /* Disable static key and reset timer. */ diff --git a/mm/ksm.c b/mm/ksm.c index 6bbe314c5260..2f3aaeb34a42 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -776,11 +776,12 @@ static void remove_rmap_item_from_tree(struct rmap_item *rmap_item) struct page *page; stable_node = rmap_item->head; - page = get_ksm_page(stable_node, GET_KSM_PAGE_NOLOCK); + page = get_ksm_page(stable_node, GET_KSM_PAGE_LOCK); if (!page) goto out; hlist_del(&rmap_item->hlist); + unlock_page(page); put_page(page); if (!hlist_empty(&stable_node->hlist)) diff --git a/mm/memfd.c b/mm/memfd.c index 2647c898990c..081dd33e6a61 100644 --- a/mm/memfd.c +++ b/mm/memfd.c @@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create, } if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; + struct ucounts *ucounts = NULL; - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts, HUGETLB_ANONHUGE_INODE, (flags >> MFD_HUGE_SHIFT) & MFD_HUGE_MASK); diff --git a/mm/memory-failure.c b/mm/memory-failure.c index 85ad98c00fd9..6f5f78885ab4 100644 --- a/mm/memory-failure.c +++ b/mm/memory-failure.c @@ -658,6 +658,7 @@ static int truncate_error_page(struct page *p, unsigned long pfn, */ static int me_kernel(struct page *p, unsigned long pfn) { + unlock_page(p); return MF_IGNORED; } @@ -667,6 +668,7 @@ static int me_kernel(struct page *p, unsigned long pfn) static int me_unknown(struct page *p, unsigned long pfn) { pr_err("Memory failure: %#lx: Unknown page state\n", pfn); + unlock_page(p); return MF_FAILED; } @@ -675,6 +677,7 @@ static int me_unknown(struct page *p, unsigned long pfn) */ static int me_pagecache_clean(struct page *p, unsigned long pfn) { + int ret; struct address_space *mapping; delete_from_lru_cache(p); @@ -683,8 +686,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * For anonymous pages we're done the only reference left * should be the one m_f() holds. */ - if (PageAnon(p)) - return MF_RECOVERED; + if (PageAnon(p)) { + ret = MF_RECOVERED; + goto out; + } /* * Now truncate the page in the page cache. This is really @@ -698,7 +703,8 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) /* * Page has been teared down in the meanwhile */ - return MF_FAILED; + ret = MF_FAILED; + goto out; } /* @@ -706,7 +712,10 @@ static int me_pagecache_clean(struct page *p, unsigned long pfn) * * Open: to take i_mutex or not for this? Right now we don't. */ - return truncate_error_page(p, pfn, mapping); + ret = truncate_error_page(p, pfn, mapping); +out: + unlock_page(p); + return ret; } /* @@ -782,24 +791,26 @@ static int me_pagecache_dirty(struct page *p, unsigned long pfn) */ static int me_swapcache_dirty(struct page *p, unsigned long pfn) { + int ret; + ClearPageDirty(p); /* Trigger EIO in shmem: */ ClearPageUptodate(p); - if (!delete_from_lru_cache(p)) - return MF_DELAYED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_DELAYED; + unlock_page(p); + return ret; } static int me_swapcache_clean(struct page *p, unsigned long pfn) { + int ret; + delete_from_swap_cache(p); - if (!delete_from_lru_cache(p)) - return MF_RECOVERED; - else - return MF_FAILED; + ret = delete_from_lru_cache(p) ? MF_FAILED : MF_RECOVERED; + unlock_page(p); + return ret; } /* @@ -820,6 +831,7 @@ static int me_huge_page(struct page *p, unsigned long pfn) mapping = page_mapping(hpage); if (mapping) { res = truncate_error_page(hpage, pfn, mapping); + unlock_page(hpage); } else { res = MF_FAILED; unlock_page(hpage); @@ -834,7 +846,6 @@ static int me_huge_page(struct page *p, unsigned long pfn) page_ref_inc(p); res = MF_RECOVERED; } - lock_page(hpage); } return res; @@ -866,6 +877,8 @@ static struct page_state { unsigned long mask; unsigned long res; enum mf_action_page_type type; + + /* Callback ->action() has to unlock the relevant page inside it. */ int (*action)(struct page *p, unsigned long pfn); } error_states[] = { { reserved, reserved, MF_MSG_KERNEL, me_kernel }, @@ -929,6 +942,7 @@ static int page_action(struct page_state *ps, struct page *p, int result; int count; + /* page p should be unlocked after returning from ps->action(). */ result = ps->action(p, pfn); count = page_count(p) - 1; @@ -949,6 +963,17 @@ static int page_action(struct page_state *ps, struct page *p, return (result == MF_RECOVERED || result == MF_DELAYED) ? 0 : -EBUSY; } +/* + * Return true if a page type of a given page is supported by hwpoison + * mechanism (while handling could fail), otherwise false. This function + * does not return true for hugetlb or device memory pages, so it's assumed + * to be called only in the context where we never have such pages. + */ +static inline bool HWPoisonHandlable(struct page *page) +{ + return PageLRU(page) || __PageMovable(page); +} + /** * __get_hwpoison_page() - Get refcount for memory error handling: * @page: raw error page (hit by memory error) @@ -959,8 +984,22 @@ static int page_action(struct page_state *ps, struct page *p, static int __get_hwpoison_page(struct page *page) { struct page *head = compound_head(page); + int ret = 0; + bool hugetlb = false; + + ret = get_hwpoison_huge_page(head, &hugetlb); + if (hugetlb) + return ret; + + /* + * This check prevents from calling get_hwpoison_unless_zero() + * for any unsupported type of page in order to reduce the risk of + * unexpected races caused by taking a page refcount. + */ + if (!HWPoisonHandlable(head)) + return 0; - if (!PageHuge(head) && PageTransHuge(head)) { + if (PageTransHuge(head)) { /* * Non anonymous thp exists only in allocation/free time. We * can't handle such a case correctly, so let's give it up. @@ -1017,7 +1056,7 @@ try_again: ret = -EIO; } } else { - if (PageHuge(p) || PageLRU(p) || __PageMovable(p)) { + if (PageHuge(p) || HWPoisonHandlable(p)) { ret = 1; } else { /* @@ -1228,7 +1267,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) if (TestSetPageHWPoison(head)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + return -EHWPOISON; } num_poisoned_pages_inc(); @@ -1288,7 +1327,7 @@ static int memory_failure_hugetlb(unsigned long pfn, int flags) goto out; } - res = identify_page_state(pfn, p, page_flags); + return identify_page_state(pfn, p, page_flags); out: unlock_page(head); return res; @@ -1404,9 +1443,10 @@ int memory_failure(unsigned long pfn, int flags) struct page *hpage; struct page *orig_head; struct dev_pagemap *pgmap; - int res; + int res = 0; unsigned long page_flags; bool retry = true; + static DEFINE_MUTEX(mf_mutex); if (!sysctl_memory_failure_recovery) panic("Memory failure on page %lx", pfn); @@ -1424,13 +1464,19 @@ int memory_failure(unsigned long pfn, int flags) return -ENXIO; } + mutex_lock(&mf_mutex); + try_again: - if (PageHuge(p)) - return memory_failure_hugetlb(pfn, flags); + if (PageHuge(p)) { + res = memory_failure_hugetlb(pfn, flags); + goto unlock_mutex; + } + if (TestSetPageHWPoison(p)) { pr_err("Memory failure: %#lx: already hardware poisoned\n", pfn); - return 0; + res = -EHWPOISON; + goto unlock_mutex; } orig_head = hpage = compound_head(p); @@ -1463,17 +1509,19 @@ try_again: res = MF_FAILED; } action_result(pfn, MF_MSG_BUDDY, res); - return res == MF_RECOVERED ? 0 : -EBUSY; + res = res == MF_RECOVERED ? 0 : -EBUSY; } else { action_result(pfn, MF_MSG_KERNEL_HIGH_ORDER, MF_IGNORED); - return -EBUSY; + res = -EBUSY; } + goto unlock_mutex; } if (PageTransHuge(hpage)) { if (try_to_split_thp_page(p, "Memory Failure") < 0) { action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED); - return -EBUSY; + res = -EBUSY; + goto unlock_mutex; } VM_BUG_ON_PAGE(!page_count(p), p); } @@ -1497,7 +1545,7 @@ try_again: if (PageCompound(p) && compound_head(p) != orig_head) { action_result(pfn, MF_MSG_DIFFERENT_COMPOUND, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1517,17 +1565,22 @@ try_again: num_poisoned_pages_dec(); unlock_page(p); put_page(p); - return 0; + goto unlock_mutex; } if (hwpoison_filter(p)) { if (TestClearPageHWPoison(p)) num_poisoned_pages_dec(); unlock_page(p); put_page(p); - return 0; + goto unlock_mutex; } - if (!PageTransTail(p) && !PageLRU(p)) + /* + * __munlock_pagevec may clear a writeback page's LRU flag without + * page_lock. We need wait writeback completion for this page or it + * may trigger vfs BUG while evict inode. + */ + if (!PageTransTail(p) && !PageLRU(p) && !PageWriteback(p)) goto identify_page_state; /* @@ -1543,7 +1596,7 @@ try_again: if (!hwpoison_user_mappings(p, pfn, flags, &p)) { action_result(pfn, MF_MSG_UNMAP_FAILED, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } /* @@ -1552,13 +1605,17 @@ try_again: if (PageLRU(p) && !PageSwapCache(p) && p->mapping == NULL) { action_result(pfn, MF_MSG_TRUNCATED_LRU, MF_IGNORED); res = -EBUSY; - goto out; + goto unlock_page; } identify_page_state: res = identify_page_state(pfn, p, page_flags); -out: + mutex_unlock(&mf_mutex); + return res; +unlock_page: unlock_page(p); +unlock_mutex: + mutex_unlock(&mf_mutex); return res; } EXPORT_SYMBOL_GPL(memory_failure); diff --git a/mm/memory.c b/mm/memory.c index 730daa00952b..486f4a2874e7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1361,7 +1361,18 @@ static inline unsigned long zap_pmd_range(struct mmu_gather *tlb, else if (zap_huge_pmd(tlb, vma, pmd, addr)) goto next; /* fall through */ + } else if (details && details->single_page && + PageTransCompound(details->single_page) && + next - addr == HPAGE_PMD_SIZE && pmd_none(*pmd)) { + spinlock_t *ptl = pmd_lock(tlb->mm, pmd); + /* + * Take and drop THP pmd lock so that we cannot return + * prematurely, while zap_huge_pmd() has cleared *pmd, + * but not yet decremented compound_mapcount(). + */ + spin_unlock(ptl); } + /* * Here there can be other concurrent MADV_DONTNEED or * trans huge page faults running, and if the pmd is @@ -2939,6 +2950,7 @@ static vm_fault_t wp_page_copy(struct vm_fault *vmf) } flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); entry = mk_pte(new_page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); entry = maybe_mkwrite(pte_mkdirty(entry), vma); /* @@ -3236,6 +3248,36 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, } /** + * unmap_mapping_page() - Unmap single page from processes. + * @page: The locked page to be unmapped. + * + * Unmap this page from any userspace process which still has it mmaped. + * Typically, for efficiency, the range of nearby pages has already been + * unmapped by unmap_mapping_pages() or unmap_mapping_range(). But once + * truncation or invalidation holds the lock on a page, it may find that + * the page has been remapped again: and then uses unmap_mapping_page() + * to unmap it finally. + */ +void unmap_mapping_page(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct zap_details details = { }; + + VM_BUG_ON(!PageLocked(page)); + VM_BUG_ON(PageTail(page)); + + details.check_mapping = mapping; + details.first_index = page->index; + details.last_index = page->index + thp_nr_pages(page) - 1; + details.single_page = page; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + +/** * unmap_mapping_pages() - Unmap pages from processes. * @mapping: The address space containing pages to be unmapped. * @start: Index of first page to be unmapped. @@ -3602,6 +3644,7 @@ static vm_fault_t do_anonymous_page(struct vm_fault *vmf) __SetPageUptodate(page); entry = mk_pte(page, vma->vm_page_prot); + entry = pte_sw_mkyoung(entry); if (vma->vm_flags & VM_WRITE) entry = pte_mkwrite(pte_mkdirty(entry)); @@ -3786,6 +3829,8 @@ void do_set_pte(struct vm_fault *vmf, struct page *page, unsigned long addr) if (prefault && arch_wants_old_prefaulted_pte()) entry = pte_mkold(entry); + else + entry = pte_sw_mkyoung(entry); if (write) entry = maybe_mkwrite(pte_mkdirty(entry), vma); diff --git a/mm/mempool.c b/mm/mempool.c index a258cf4de575..0b8afbec3e35 100644 --- a/mm/mempool.c +++ b/mm/mempool.c @@ -106,7 +106,8 @@ static __always_inline void kasan_poison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_slab_free_mempool(element); else if (pool->alloc == mempool_alloc_pages) - kasan_free_pages(element, (unsigned long)pool->pool_data, false); + kasan_poison_pages(element, (unsigned long)pool->pool_data, + false); } static void kasan_unpoison_element(mempool_t *pool, void *element) @@ -114,7 +115,8 @@ static void kasan_unpoison_element(mempool_t *pool, void *element) if (pool->alloc == mempool_alloc_slab || pool->alloc == mempool_kmalloc) kasan_unpoison_range(element, __ksize(element)); else if (pool->alloc == mempool_alloc_pages) - kasan_alloc_pages(element, (unsigned long)pool->pool_data, false); + kasan_unpoison_pages(element, (unsigned long)pool->pool_data, + false); } static __always_inline void add_element(mempool_t *pool, void *element) diff --git a/mm/migrate.c b/mm/migrate.c index b234c3f3acb7..41ff2c9896c4 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -295,6 +295,7 @@ void __migration_entry_wait(struct mm_struct *mm, pte_t *ptep, goto out; page = migration_entry_to_page(entry); + page = compound_head(page); /* * Once page cache replacement of page migration started, page_count diff --git a/mm/mlock.c b/mm/mlock.c index df590fda5688..e338ebc4ad29 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall) */ static DEFINE_SPINLOCK(shmlock_user_lock); -int user_shm_lock(size_t size, struct user_struct *user) +int user_shm_lock(size_t size, struct ucounts *ucounts) { unsigned long lock_limit, locked; + long memlock; int allowed = 0; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user) allowed = 1; lock_limit >>= PAGE_SHIFT; spin_lock(&shmlock_user_lock); - if (!allowed && - locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK)) + memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + + if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); + goto out; + } + if (!get_ucounts(ucounts)) { + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked); goto out; - get_uid(user); - user->locked_shm += locked; + } allowed = 1; out: spin_unlock(&shmlock_user_lock); return allowed; } -void user_shm_unlock(size_t size, struct user_struct *user) +void user_shm_unlock(size_t size, struct ucounts *ucounts) { spin_lock(&shmlock_user_lock); - user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; + dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT); spin_unlock(&shmlock_user_lock); - free_uid(user); + put_ucounts(ucounts); } diff --git a/mm/mmap.c b/mm/mmap.c index 0584e540246e..bc88d1674364 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -1611,7 +1611,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, goto out_fput; } } else if (flags & MAP_HUGETLB) { - struct user_struct *user = NULL; + struct ucounts *ucounts = NULL; struct hstate *hs; hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); @@ -1627,7 +1627,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, */ file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, VM_NORESERVE, - &user, HUGETLB_ANONHUGE_INODE, + &ucounts, HUGETLB_ANONHUGE_INODE, (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); if (IS_ERR(file)) return PTR_ERR(file); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index aaa1655cf682..e7af86e1a312 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -382,7 +382,7 @@ int page_group_by_mobility_disabled __read_mostly; static DEFINE_STATIC_KEY_TRUE(deferred_pages); /* - * Calling kasan_free_pages() only after deferred memory initialization + * Calling kasan_poison_pages() only after deferred memory initialization * has completed. Poisoning pages during deferred memory init will greatly * lengthen the process and cause problem in large memory systems as the * deferred pages initialization is done with interrupt disabled. @@ -394,15 +394,12 @@ static DEFINE_STATIC_KEY_TRUE(deferred_pages); * on-demand allocation and then freed again before the deferred pages * initialization is done, but this is not likely to happen. */ -static inline void kasan_free_nondeferred_pages(struct page *page, int order, - bool init, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { - if (static_branch_unlikely(&deferred_pages)) - return; - if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) - return; - kasan_free_pages(page, order, init); + return static_branch_unlikely(&deferred_pages) || + (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); } /* Returns true if the struct page for the pfn is uninitialised */ @@ -453,13 +450,11 @@ defer_init(int nid, unsigned long pfn, unsigned long end_pfn) return false; } #else -static inline void kasan_free_nondeferred_pages(struct page *page, int order, - bool init, fpi_t fpi_flags) +static inline bool should_skip_kasan_poison(struct page *page, fpi_t fpi_flags) { - if (!IS_ENABLED(CONFIG_KASAN_GENERIC) && - (fpi_flags & FPI_SKIP_KASAN_POISON)) - return; - kasan_free_pages(page, order, init); + return (!IS_ENABLED(CONFIG_KASAN_GENERIC) && + (fpi_flags & FPI_SKIP_KASAN_POISON)) || + PageSkipKASanPoison(page); } static inline bool early_page_uninitialised(unsigned long pfn) @@ -1226,10 +1221,16 @@ out: return ret; } -static void kernel_init_free_pages(struct page *page, int numpages) +static void kernel_init_free_pages(struct page *page, int numpages, bool zero_tags) { int i; + if (zero_tags) { + for (i = 0; i < numpages; i++) + tag_clear_highpage(page + i); + return; + } + /* s390's use of memset() could override KASAN redzones. */ kasan_disable_current(); for (i = 0; i < numpages; i++) { @@ -1245,7 +1246,7 @@ static __always_inline bool free_pages_prepare(struct page *page, unsigned int order, bool check_free, fpi_t fpi_flags) { int bad = 0; - bool init; + bool skip_kasan_poison = should_skip_kasan_poison(page, fpi_flags); VM_BUG_ON_PAGE(PageTail(page), page); @@ -1314,10 +1315,17 @@ static __always_inline bool free_pages_prepare(struct page *page, * With hardware tag-based KASAN, memory tags must be set before the * page becomes unavailable via debug_pagealloc or arch_free_page. */ - init = want_init_on_free(); - if (init && !kasan_has_integrated_init()) - kernel_init_free_pages(page, 1 << order); - kasan_free_nondeferred_pages(page, order, init, fpi_flags); + if (kasan_has_integrated_init()) { + if (!skip_kasan_poison) + kasan_free_pages(page, order); + } else { + bool init = want_init_on_free(); + + if (init) + kernel_init_free_pages(page, 1 << order, false); + if (!skip_kasan_poison) + kasan_poison_pages(page, order, init); + } /* * arch_free_page() can make the page's contents inaccessible. s390 @@ -2324,8 +2332,6 @@ static bool check_new_pages(struct page *page, unsigned int order) inline void post_alloc_hook(struct page *page, unsigned int order, gfp_t gfp_flags) { - bool init; - set_page_private(page, 0); set_page_refcounted(page); @@ -2344,10 +2350,16 @@ inline void post_alloc_hook(struct page *page, unsigned int order, * kasan_alloc_pages and kernel_init_free_pages must be * kept together to avoid discrepancies in behavior. */ - init = !want_init_on_free() && want_init_on_alloc(gfp_flags); - kasan_alloc_pages(page, order, init); - if (init && !kasan_has_integrated_init()) - kernel_init_free_pages(page, 1 << order); + if (kasan_has_integrated_init()) { + kasan_alloc_pages(page, order, gfp_flags); + } else { + bool init = !want_init_on_free() && want_init_on_alloc(gfp_flags); + + kasan_unpoison_pages(page, order, init); + if (init) + kernel_init_free_pages(page, 1 << order, + gfp_flags & __GFP_ZEROTAGS); + } set_page_owner(page, order, gfp_flags); } @@ -5053,9 +5065,13 @@ unsigned long __alloc_pages_bulk(gfp_t gfp, int preferred_nid, * Skip populated array elements to determine if any pages need * to be allocated before disabling IRQs. */ - while (page_array && page_array[nr_populated] && nr_populated < nr_pages) + while (page_array && nr_populated < nr_pages && page_array[nr_populated]) nr_populated++; + /* Already populated array? */ + if (unlikely(page_array && nr_pages - nr_populated == 0)) + return nr_populated; + /* Use the single page allocator for one page. */ if (nr_pages - nr_populated == 1) goto failed; @@ -9158,6 +9174,8 @@ bool take_page_off_buddy(struct page *page) del_page_from_free_list(page_head, zone, page_order); break_down_buddy_pages(zone, page_head, page, 0, page_order, migratetype); + if (!is_migrate_isolate(migratetype)) + __mod_zone_freepage_state(zone, -1, migratetype); ret = true; break; } diff --git a/mm/page_vma_mapped.c b/mm/page_vma_mapped.c index 2cf01d933f13..a4435311754b 100644 --- a/mm/page_vma_mapped.c +++ b/mm/page_vma_mapped.c @@ -116,6 +116,13 @@ static bool check_pte(struct page_vma_mapped_walk *pvmw) return pfn_is_match(pvmw->page, pfn); } +static void step_forward(struct page_vma_mapped_walk *pvmw, unsigned long size) +{ + pvmw->address = (pvmw->address + size) & ~(size - 1); + if (!pvmw->address) + pvmw->address = ULONG_MAX; +} + /** * page_vma_mapped_walk - check if @pvmw->page is mapped in @pvmw->vma at * @pvmw->address @@ -144,6 +151,7 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) { struct mm_struct *mm = pvmw->vma->vm_mm; struct page *page = pvmw->page; + unsigned long end; pgd_t *pgd; p4d_t *p4d; pud_t *pud; @@ -153,10 +161,11 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) if (pvmw->pmd && !pvmw->pte) return not_found(pvmw); - if (pvmw->pte) - goto next_pte; + if (unlikely(PageHuge(page))) { + /* The only possible mapping was handled on last iteration */ + if (pvmw->pte) + return not_found(pvmw); - if (unlikely(PageHuge(pvmw->page))) { /* when pud is not present, pte will be NULL */ pvmw->pte = huge_pte_offset(mm, pvmw->address, page_size(page)); if (!pvmw->pte) @@ -168,78 +177,108 @@ bool page_vma_mapped_walk(struct page_vma_mapped_walk *pvmw) return not_found(pvmw); return true; } -restart: - pgd = pgd_offset(mm, pvmw->address); - if (!pgd_present(*pgd)) - return false; - p4d = p4d_offset(pgd, pvmw->address); - if (!p4d_present(*p4d)) - return false; - pud = pud_offset(p4d, pvmw->address); - if (!pud_present(*pud)) - return false; - pvmw->pmd = pmd_offset(pud, pvmw->address); + /* - * Make sure the pmd value isn't cached in a register by the - * compiler and used as a stale value after we've observed a - * subsequent update. + * Seek to next pte only makes sense for THP. + * But more important than that optimization, is to filter out + * any PageKsm page: whose page->index misleads vma_address() + * and vma_address_end() to disaster. */ - pmde = READ_ONCE(*pvmw->pmd); - if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { - pvmw->ptl = pmd_lock(mm, pvmw->pmd); - if (likely(pmd_trans_huge(*pvmw->pmd))) { - if (pvmw->flags & PVMW_MIGRATION) - return not_found(pvmw); - if (pmd_page(*pvmw->pmd) != page) - return not_found(pvmw); - return true; - } else if (!pmd_present(*pvmw->pmd)) { - if (thp_migration_supported()) { - if (!(pvmw->flags & PVMW_MIGRATION)) + end = PageTransCompound(page) ? + vma_address_end(page, pvmw->vma) : + pvmw->address + PAGE_SIZE; + if (pvmw->pte) + goto next_pte; +restart: + do { + pgd = pgd_offset(mm, pvmw->address); + if (!pgd_present(*pgd)) { + step_forward(pvmw, PGDIR_SIZE); + continue; + } + p4d = p4d_offset(pgd, pvmw->address); + if (!p4d_present(*p4d)) { + step_forward(pvmw, P4D_SIZE); + continue; + } + pud = pud_offset(p4d, pvmw->address); + if (!pud_present(*pud)) { + step_forward(pvmw, PUD_SIZE); + continue; + } + + pvmw->pmd = pmd_offset(pud, pvmw->address); + /* + * Make sure the pmd value isn't cached in a register by the + * compiler and used as a stale value after we've observed a + * subsequent update. + */ + pmde = READ_ONCE(*pvmw->pmd); + + if (pmd_trans_huge(pmde) || is_pmd_migration_entry(pmde)) { + pvmw->ptl = pmd_lock(mm, pvmw->pmd); + pmde = *pvmw->pmd; + if (likely(pmd_trans_huge(pmde))) { + if (pvmw->flags & PVMW_MIGRATION) return not_found(pvmw); - if (is_migration_entry(pmd_to_swp_entry(*pvmw->pmd))) { - swp_entry_t entry = pmd_to_swp_entry(*pvmw->pmd); + if (pmd_page(pmde) != page) + return not_found(pvmw); + return true; + } + if (!pmd_present(pmde)) { + swp_entry_t entry; - if (migration_entry_to_page(entry) != page) - return not_found(pvmw); - return true; - } + if (!thp_migration_supported() || + !(pvmw->flags & PVMW_MIGRATION)) + return not_found(pvmw); + entry = pmd_to_swp_entry(pmde); + if (!is_migration_entry(entry) || + migration_entry_to_page(entry) != page) + return not_found(pvmw); + return true; } - return not_found(pvmw); - } else { /* THP pmd was split under us: handle on pte level */ spin_unlock(pvmw->ptl); pvmw->ptl = NULL; + } else if (!pmd_present(pmde)) { + /* + * If PVMW_SYNC, take and drop THP pmd lock so that we + * cannot return prematurely, while zap_huge_pmd() has + * cleared *pmd but not decremented compound_mapcount(). + */ + if ((pvmw->flags & PVMW_SYNC) && + PageTransCompound(page)) { + spinlock_t *ptl = pmd_lock(mm, pvmw->pmd); + + spin_unlock(ptl); + } + step_forward(pvmw, PMD_SIZE); + continue; } - } else if (!pmd_present(pmde)) { - return false; - } - if (!map_pte(pvmw)) - goto next_pte; - while (1) { + if (!map_pte(pvmw)) + goto next_pte; +this_pte: if (check_pte(pvmw)) return true; next_pte: - /* Seek to next pte only makes sense for THP */ - if (!PageTransHuge(pvmw->page) || PageHuge(pvmw->page)) - return not_found(pvmw); do { pvmw->address += PAGE_SIZE; - if (pvmw->address >= pvmw->vma->vm_end || - pvmw->address >= - __vma_address(pvmw->page, pvmw->vma) + - thp_size(pvmw->page)) + if (pvmw->address >= end) return not_found(pvmw); /* Did we cross page table boundary? */ - if (pvmw->address % PMD_SIZE == 0) { - pte_unmap(pvmw->pte); + if ((pvmw->address & (PMD_SIZE - PAGE_SIZE)) == 0) { if (pvmw->ptl) { spin_unlock(pvmw->ptl); pvmw->ptl = NULL; } + pte_unmap(pvmw->pte); + pvmw->pte = NULL; goto restart; - } else { - pvmw->pte++; + } + pvmw->pte++; + if ((pvmw->flags & PVMW_SYNC) && !pvmw->ptl) { + pvmw->ptl = pte_lockptr(mm, pvmw->pmd); + spin_lock(pvmw->ptl); } } while (pte_none(*pvmw->pte)); @@ -247,7 +286,10 @@ next_pte: pvmw->ptl = pte_lockptr(mm, pvmw->pmd); spin_lock(pvmw->ptl); } - } + goto this_pte; + } while (pvmw->address < end); + + return false; } /** @@ -266,14 +308,10 @@ int page_mapped_in_vma(struct page *page, struct vm_area_struct *vma) .vma = vma, .flags = PVMW_SYNC, }; - unsigned long start, end; - - start = __vma_address(page, vma); - end = start + thp_size(page) - PAGE_SIZE; - if (unlikely(end < vma->vm_start || start >= vma->vm_end)) + pvmw.address = vma_address(page, vma); + if (pvmw.address == -EFAULT) return 0; - pvmw.address = max(start, vma->vm_start); if (!page_vma_mapped_walk(&pvmw)) return 0; page_vma_mapped_walk_done(&pvmw); diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index c2210e1cdb51..4e640baf9794 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -135,9 +135,8 @@ pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, unsigned long address, { pmd_t pmd; VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(!pmd_present(*pmdp)); - /* Below assumes pmd_present() is true */ - VM_BUG_ON(!pmd_trans_huge(*pmdp) && !pmd_devmap(*pmdp)); + VM_BUG_ON(pmd_present(*pmdp) && !pmd_trans_huge(*pmdp) && + !pmd_devmap(*pmdp)); pmd = pmdp_huge_get_and_clear(vma->vm_mm, address, pmdp); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); return pmd; diff --git a/mm/rmap.c b/mm/rmap.c index 693a610e181d..e05c300048e6 100644 --- a/mm/rmap.c +++ b/mm/rmap.c @@ -707,7 +707,6 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags) */ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) { - unsigned long address; if (PageAnon(page)) { struct anon_vma *page__anon_vma = page_anon_vma(page); /* @@ -717,15 +716,13 @@ unsigned long page_address_in_vma(struct page *page, struct vm_area_struct *vma) if (!vma->anon_vma || !page__anon_vma || vma->anon_vma->root != page__anon_vma->root) return -EFAULT; - } else if (page->mapping) { - if (!vma->vm_file || vma->vm_file->f_mapping != page->mapping) - return -EFAULT; - } else + } else if (!vma->vm_file) { return -EFAULT; - address = __vma_address(page, vma); - if (unlikely(address < vma->vm_start || address >= vma->vm_end)) + } else if (vma->vm_file->f_mapping != compound_head(page)->mapping) { return -EFAULT; - return address; + } + + return vma_address(page, vma); } pmd_t *mm_find_pmd(struct mm_struct *mm, unsigned long address) @@ -919,7 +916,7 @@ static bool page_mkclean_one(struct page *page, struct vm_area_struct *vma, */ mmu_notifier_range_init(&range, MMU_NOTIFY_PROTECTION_PAGE, 0, vma, vma->vm_mm, address, - min(vma->vm_end, address + page_size(page))); + vma_address_end(page, vma)); mmu_notifier_invalidate_range_start(&range); while (page_vma_mapped_walk(&pvmw)) { @@ -1405,6 +1402,15 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, struct mmu_notifier_range range; enum ttu_flags flags = (enum ttu_flags)(long)arg; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + if (flags & TTU_SYNC) + pvmw.flags = PVMW_SYNC; + /* munlock has nothing to gain from examining un-locked vmas */ if ((flags & TTU_MUNLOCK) && !(vma->vm_flags & VM_LOCKED)) return true; @@ -1426,9 +1432,10 @@ static bool try_to_unmap_one(struct page *page, struct vm_area_struct *vma, * Note that the page can not be free in this function as call of * try_to_unmap() must hold a reference on the page. */ + range.end = PageKsm(page) ? + address + PAGE_SIZE : vma_address_end(page, vma); mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma, vma->vm_mm, - address, - min(vma->vm_end, address + page_size(page))); + address, range.end); if (PageHuge(page)) { /* * If sharing is possible, start and end will be adjusted @@ -1777,7 +1784,13 @@ bool try_to_unmap(struct page *page, enum ttu_flags flags) else rmap_walk(page, &rwc); - return !page_mapcount(page) ? true : false; + /* + * When racing against e.g. zap_pte_range() on another cpu, + * in between its ptep_get_and_clear_full() and page_remove_rmap(), + * try_to_unmap() may return false when it is about to become true, + * if page table locking is skipped: use TTU_SYNC to wait for that. + */ + return !page_mapcount(page); } /** @@ -1874,6 +1887,7 @@ static void rmap_walk_anon(struct page *page, struct rmap_walk_control *rwc, struct vm_area_struct *vma = avc->vma; unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) @@ -1928,6 +1942,7 @@ static void rmap_walk_file(struct page *page, struct rmap_walk_control *rwc, pgoff_start, pgoff_end) { unsigned long address = vma_address(page, vma); + VM_BUG_ON_VMA(address == -EFAULT, vma); cond_resched(); if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) diff --git a/mm/shmem.c b/mm/shmem.c index a08cedefbfaa..14997a98410c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2227,7 +2227,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, } #endif -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2239,13 +2239,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) * no serialization needed when called from shm_destroy(). */ if (lock && !(info->flags & VM_LOCKED)) { - if (!user_shm_lock(inode->i_size, user)) + if (!user_shm_lock(inode->i_size, ucounts)) goto out_nomem; info->flags |= VM_LOCKED; mapping_set_unevictable(file->f_mapping); } - if (!lock && (info->flags & VM_LOCKED) && user) { - user_shm_unlock(inode->i_size, user); + if (!lock && (info->flags & VM_LOCKED) && ucounts) { + user_shm_unlock(inode->i_size, ucounts); info->flags &= ~VM_LOCKED; mapping_clear_unevictable(file->f_mapping); } @@ -2258,25 +2258,11 @@ out_nomem: static int shmem_mmap(struct file *file, struct vm_area_struct *vma) { struct shmem_inode_info *info = SHMEM_I(file_inode(file)); + int ret; - if (info->seals & F_SEAL_FUTURE_WRITE) { - /* - * New PROT_WRITE and MAP_SHARED mmaps are not allowed when - * "future write" seal active. - */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return -EPERM; - - /* - * Since an F_SEAL_FUTURE_WRITE sealed memfd can be mapped as - * MAP_SHARED and read-only, take care to not allow mprotect to - * revert protections on such mappings. Do this only for shared - * mappings. For private mappings, don't need to mask - * VM_MAYWRITE as we still want them to be COW-writable. - */ - if (vma->vm_flags & VM_SHARED) - vma->vm_flags &= ~(VM_MAYWRITE); - } + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; /* arm64 - allow memory tagging on RAM-based files */ vma->vm_flags |= VM_MTE_ALLOWED; @@ -2375,8 +2361,18 @@ static int shmem_mfill_atomic_pte(struct mm_struct *dst_mm, pgoff_t offset, max_off; ret = -ENOMEM; - if (!shmem_inode_acct_block(inode, 1)) + if (!shmem_inode_acct_block(inode, 1)) { + /* + * We may have got a page, returned -ENOENT triggering a retry, + * and now we find ourselves with -ENOMEM. Release the page, to + * avoid a BUG_ON in our caller. + */ + if (unlikely(*pagep)) { + put_page(*pagep); + *pagep = NULL; + } goto out; + } if (!*pagep) { page = shmem_alloc_page(gfp, info, pgoff); @@ -4096,7 +4092,7 @@ int shmem_unuse(unsigned int type, bool frontswap, return 0; } -int shmem_lock(struct file *file, int lock, struct user_struct *user) +int shmem_lock(struct file *file, int lock, struct ucounts *ucounts) { return 0; } diff --git a/mm/shuffle.h b/mm/shuffle.h index 71b784f0b7c3..cec62984f7d3 100644 --- a/mm/shuffle.h +++ b/mm/shuffle.h @@ -10,7 +10,7 @@ DECLARE_STATIC_KEY_FALSE(page_alloc_shuffle_key); extern void __shuffle_free_memory(pg_data_t *pgdat); extern bool shuffle_pick_tail(void); -static inline void shuffle_free_memory(pg_data_t *pgdat) +static inline void __meminit shuffle_free_memory(pg_data_t *pgdat) { if (!static_branch_unlikely(&page_alloc_shuffle_key)) return; @@ -18,7 +18,7 @@ static inline void shuffle_free_memory(pg_data_t *pgdat) } extern void __shuffle_zone(struct zone *z); -static inline void shuffle_zone(struct zone *z) +static inline void __meminit shuffle_zone(struct zone *z) { if (!static_branch_unlikely(&page_alloc_shuffle_key)) return; diff --git a/mm/slab_common.c b/mm/slab_common.c index f8833d3e5d47..7cab77655f11 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -97,8 +97,7 @@ EXPORT_SYMBOL(kmem_cache_size); #ifdef CONFIG_DEBUG_VM static int kmem_cache_sanity_check(const char *name, unsigned int size) { - if (!name || in_interrupt() || size < sizeof(void *) || - size > KMALLOC_MAX_SIZE) { + if (!name || in_interrupt() || size > KMALLOC_MAX_SIZE) { pr_err("kmem_cache_create(%s) integrity check failed\n", name); return -EINVAL; } @@ -318,6 +317,16 @@ kmem_cache_create_usercopy(const char *name, const char *cache_name; int err; +#ifdef CONFIG_SLUB_DEBUG + /* + * If no slub_debug was enabled globally, the static key is not yet + * enabled by setup_slub_debug(). Enable it if the cache is being + * created with any of the debugging flags passed explicitly. + */ + if (flags & SLAB_DEBUG_FLAGS) + static_branch_enable(&slub_debug_enabled); +#endif + mutex_lock(&slab_mutex); err = kmem_cache_sanity_check(name, size); diff --git a/mm/slub.c b/mm/slub.c index feda53ae62ba..61bd40e3eb9a 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -15,6 +15,7 @@ #include <linux/module.h> #include <linux/bit_spinlock.h> #include <linux/interrupt.h> +#include <linux/swab.h> #include <linux/bitops.h> #include <linux/slab.h> #include "slab.h" @@ -301,6 +302,7 @@ static inline void *get_freepointer_safe(struct kmem_cache *s, void *object) if (!debug_pagealloc_enabled_static()) return get_freepointer(s, object); + object = kasan_reset_tag(object); freepointer_addr = (unsigned long)object + s->offset; copy_from_kernel_nofault(&p, (void **)freepointer_addr, sizeof(p)); return freelist_ptr(s, p, freepointer_addr); @@ -711,15 +713,15 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) p, p - addr, get_freepointer(s, p)); if (s->flags & SLAB_RED_ZONE) - print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, + print_section(KERN_ERR, "Redzone ", p - s->red_left_pad, s->red_left_pad); else if (p > addr + 16) print_section(KERN_ERR, "Bytes b4 ", p - 16, 16); - print_section(KERN_ERR, "Object ", p, + print_section(KERN_ERR, "Object ", p, min_t(unsigned int, s->object_size, PAGE_SIZE)); if (s->flags & SLAB_RED_ZONE) - print_section(KERN_ERR, "Redzone ", p + s->object_size, + print_section(KERN_ERR, "Redzone ", p + s->object_size, s->inuse - s->object_size); off = get_info_end(s); @@ -731,7 +733,7 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) if (off != size_from_object(s)) /* Beginning of the filler is the free pointer */ - print_section(KERN_ERR, "Padding ", p + off, + print_section(KERN_ERR, "Padding ", p + off, size_from_object(s) - off); dump_stack(); @@ -908,11 +910,11 @@ static int check_object(struct kmem_cache *s, struct page *page, u8 *endobject = object + s->object_size; if (s->flags & SLAB_RED_ZONE) { - if (!check_bytes_and_report(s, page, object, "Redzone", + if (!check_bytes_and_report(s, page, object, "Left Redzone", object - s->red_left_pad, val, s->red_left_pad)) return 0; - if (!check_bytes_and_report(s, page, object, "Redzone", + if (!check_bytes_and_report(s, page, object, "Right Redzone", endobject, val, s->inuse - s->object_size)) return 0; } else { @@ -927,7 +929,7 @@ static int check_object(struct kmem_cache *s, struct page *page, if (val != SLUB_RED_ACTIVE && (s->flags & __OBJECT_POISON) && (!check_bytes_and_report(s, page, p, "Poison", p, POISON_FREE, s->object_size - 1) || - !check_bytes_and_report(s, page, p, "Poison", + !check_bytes_and_report(s, page, p, "End Poison", p + s->object_size - 1, POISON_END, 1))) return 0; /* @@ -3688,7 +3690,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) { slab_flags_t flags = s->flags; unsigned int size = s->object_size; - unsigned int freepointer_area; unsigned int order; /* @@ -3697,13 +3698,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) * the possible location of the free pointer. */ size = ALIGN(size, sizeof(void *)); - /* - * This is the area of the object where a freepointer can be - * safely written. If redzoning adds more to the inuse size, we - * can't use that portion for writing the freepointer, so - * s->offset must be limited within this for the general case. - */ - freepointer_area = size; #ifdef CONFIG_SLUB_DEBUG /* @@ -3729,19 +3723,21 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * With that we have determined the number of bytes in actual use - * by the object. This is the potential offset to the free pointer. + * by the object and redzoning. */ s->inuse = size; - if (((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || - s->ctor)) { + if ((flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)) || + ((flags & SLAB_RED_ZONE) && s->object_size < sizeof(void *)) || + s->ctor) { /* * Relocate free pointer after the object if it is not * permitted to overwrite the first word of the object on * kmem_cache_free. * * This is the case if we do RCU, have a constructor or - * destructor or are poisoning the objects. + * destructor, are poisoning the objects, or are + * redzoning an object smaller than sizeof(void *). * * The assumption that s->offset >= s->inuse means free * pointer is outside of the object is used in the @@ -3750,13 +3746,13 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) */ s->offset = size; size += sizeof(void *); - } else if (freepointer_area > sizeof(void *)) { + } else { /* * Store freelist pointer near middle of object to keep * it away from the edges of the object to avoid small * sized over/underflows from neighboring allocations. */ - s->offset = ALIGN(freepointer_area / 2, sizeof(void *)); + s->offset = ALIGN_DOWN(s->object_size / 2, sizeof(void *)); } #ifdef CONFIG_SLUB_DEBUG @@ -3828,15 +3824,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { -#ifdef CONFIG_SLUB_DEBUG - /* - * If no slub_debug was enabled globally, the static key is not yet - * enabled by setup_slub_debug(). Enable it if the cache is being - * created with any of the debugging flags passed explicitly. - */ - if (flags & SLAB_DEBUG_FLAGS) - static_branch_enable(&slub_debug_enabled); -#endif s->flags = kmem_cache_flags(s->size, flags, s->name); #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); diff --git a/mm/sparse.c b/mm/sparse.c index b2ada9dc00cb..55c18aff3e42 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -344,6 +344,15 @@ size_t mem_section_usage_size(void) return sizeof(struct mem_section_usage) + usemap_size(); } +static inline phys_addr_t pgdat_to_phys(struct pglist_data *pgdat) +{ +#ifndef CONFIG_NEED_MULTIPLE_NODES + return __pa_symbol(pgdat); +#else + return __pa(pgdat); +#endif +} + #ifdef CONFIG_MEMORY_HOTREMOVE static struct mem_section_usage * __init sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, @@ -362,7 +371,7 @@ sparse_early_usemaps_alloc_pgdat_section(struct pglist_data *pgdat, * from the same section as the pgdat where possible to avoid * this problem. */ - goal = __pa(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); + goal = pgdat_to_phys(pgdat) & (PAGE_SECTION_MASK << PAGE_SHIFT); limit = goal + (1UL << PA_SECTION_SHIFT); nid = early_pfn_to_nid(goal >> PAGE_SHIFT); again: @@ -390,7 +399,7 @@ static void __init check_usemap_section_nr(int nid, } usemap_snr = pfn_to_section_nr(__pa(usage) >> PAGE_SHIFT); - pgdat_snr = pfn_to_section_nr(__pa(pgdat) >> PAGE_SHIFT); + pgdat_snr = pfn_to_section_nr(pgdat_to_phys(pgdat) >> PAGE_SHIFT); if (usemap_snr == pgdat_snr) return; diff --git a/mm/swapfile.c b/mm/swapfile.c index 149e77454e3c..996afa8131c8 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1900,7 +1900,7 @@ unsigned int count_swap_pages(int type, int free) static inline int pte_same_as_swp(pte_t pte, pte_t swp_pte) { - return pte_same(pte_swp_clear_soft_dirty(pte), swp_pte); + return pte_same(pte_swp_clear_flags(pte), swp_pte); } /* diff --git a/mm/truncate.c b/mm/truncate.c index 95af244b112a..234ddd879caa 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -167,13 +167,10 @@ void do_invalidatepage(struct page *page, unsigned int offset, * its lock, b) when a concurrent invalidate_mapping_pages got there first and * c) when tmpfs swizzles a page between a tmpfs inode and swapper_space. */ -static void -truncate_cleanup_page(struct address_space *mapping, struct page *page) +static void truncate_cleanup_page(struct page *page) { - if (page_mapped(page)) { - unsigned int nr = thp_nr_pages(page); - unmap_mapping_pages(mapping, page->index, nr, false); - } + if (page_mapped(page)) + unmap_mapping_page(page); if (page_has_private(page)) do_invalidatepage(page, 0, thp_size(page)); @@ -218,7 +215,7 @@ int truncate_inode_page(struct address_space *mapping, struct page *page) if (page->mapping != mapping) return -EIO; - truncate_cleanup_page(mapping, page); + truncate_cleanup_page(page); delete_from_page_cache(page); return 0; } @@ -325,7 +322,7 @@ void truncate_inode_pages_range(struct address_space *mapping, index = indices[pagevec_count(&pvec) - 1] + 1; truncate_exceptional_pvec_entries(mapping, &pvec, indices); for (i = 0; i < pagevec_count(&pvec); i++) - truncate_cleanup_page(mapping, pvec.pages[i]); + truncate_cleanup_page(pvec.pages[i]); delete_from_page_cache_batch(mapping, &pvec); for (i = 0; i < pagevec_count(&pvec); i++) unlock_page(pvec.pages[i]); @@ -639,6 +636,16 @@ int invalidate_inode_pages2_range(struct address_space *mapping, continue; } + if (!did_range_unmap && page_mapped(page)) { + /* + * If page is mapped, before taking its lock, + * zap the rest of the file in one hit. + */ + unmap_mapping_pages(mapping, index, + (1 + end - index), false); + did_range_unmap = 1; + } + lock_page(page); WARN_ON(page_to_index(page) != index); if (page->mapping != mapping) { @@ -646,23 +653,11 @@ int invalidate_inode_pages2_range(struct address_space *mapping, continue; } wait_on_page_writeback(page); - if (page_mapped(page)) { - if (!did_range_unmap) { - /* - * Zap the rest of the file in one hit. - */ - unmap_mapping_pages(mapping, index, - (1 + end - index), false); - did_range_unmap = 1; - } else { - /* - * Just zap this page - */ - unmap_mapping_pages(mapping, index, - 1, false); - } - } + + if (page_mapped(page)) + unmap_mapping_page(page); BUG_ON(page_mapped(page)); + ret2 = do_launder_page(mapping, page); if (ret2 == 0) { if (!invalidate_complete_page2(mapping, page)) diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index e14b3820c6a8..63a73e164d55 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -360,38 +360,38 @@ out: * If a reservation for the page existed in the reservation * map of a private mapping, the map was modified to indicate * the reservation was consumed when the page was allocated. - * We clear the PagePrivate flag now so that the global + * We clear the HPageRestoreReserve flag now so that the global * reserve count will not be incremented in free_huge_page. * The reservation map will still indicate the reservation * was consumed and possibly prevent later page allocation. * This is better than leaking a global reservation. If no - * reservation existed, it is still safe to clear PagePrivate - * as no adjustments to reservation counts were made during - * allocation. + * reservation existed, it is still safe to clear + * HPageRestoreReserve as no adjustments to reservation counts + * were made during allocation. * * The reservation map for shared mappings indicates which * pages have reservations. When a huge page is allocated * for an address with a reservation, no change is made to - * the reserve map. In this case PagePrivate will be set - * to indicate that the global reservation count should be + * the reserve map. In this case HPageRestoreReserve will be + * set to indicate that the global reservation count should be * incremented when the page is freed. This is the desired * behavior. However, when a huge page is allocated for an * address without a reservation a reservation entry is added - * to the reservation map, and PagePrivate will not be set. - * When the page is freed, the global reserve count will NOT - * be incremented and it will appear as though we have leaked - * reserved page. In this case, set PagePrivate so that the - * global reserve count will be incremented to match the - * reservation map entry which was created. + * to the reservation map, and HPageRestoreReserve will not be + * set. When the page is freed, the global reserve count will + * NOT be incremented and it will appear as though we have + * leaked reserved page. In this case, set HPageRestoreReserve + * so that the global reserve count will be incremented to + * match the reservation map entry which was created. * * Note that vm_alloc_shared is based on the flags of the vma * for which the page was originally allocated. dst_vma could * be different or NULL on error. */ if (vm_alloc_shared) - SetPagePrivate(page); + SetHPageRestoreReserve(page); else - ClearPagePrivate(page); + ClearHPageRestoreReserve(page); put_page(page); } BUG_ON(copied < 0); diff --git a/mm/vmalloc.c b/mm/vmalloc.c index a13ac524f6ff..d0a7d89be091 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -2344,15 +2344,16 @@ static void clear_vm_uninitialized_flag(struct vm_struct *vm) } static struct vm_struct *__get_vm_area_node(unsigned long size, - unsigned long align, unsigned long flags, unsigned long start, - unsigned long end, int node, gfp_t gfp_mask, const void *caller) + unsigned long align, unsigned long shift, unsigned long flags, + unsigned long start, unsigned long end, int node, + gfp_t gfp_mask, const void *caller) { struct vmap_area *va; struct vm_struct *area; unsigned long requested_size = size; BUG_ON(in_interrupt()); - size = PAGE_ALIGN(size); + size = ALIGN(size, 1ul << shift); if (unlikely(!size)) return NULL; @@ -2384,8 +2385,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, unsigned long start, unsigned long end, const void *caller) { - return __get_vm_area_node(size, 1, flags, start, end, NUMA_NO_NODE, - GFP_KERNEL, caller); + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, start, end, + NUMA_NO_NODE, GFP_KERNEL, caller); } /** @@ -2401,7 +2402,8 @@ struct vm_struct *__get_vm_area_caller(unsigned long size, unsigned long flags, */ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) { - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, + VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, __builtin_return_address(0)); } @@ -2409,7 +2411,8 @@ struct vm_struct *get_vm_area(unsigned long size, unsigned long flags) struct vm_struct *get_vm_area_caller(unsigned long size, unsigned long flags, const void *caller) { - return __get_vm_area_node(size, 1, flags, VMALLOC_START, VMALLOC_END, + return __get_vm_area_node(size, 1, PAGE_SHIFT, flags, + VMALLOC_START, VMALLOC_END, NUMA_NO_NODE, GFP_KERNEL, caller); } @@ -2902,9 +2905,9 @@ void *__vmalloc_node_range(unsigned long size, unsigned long align, } again: - size = PAGE_ALIGN(size); - area = __get_vm_area_node(size, align, VM_ALLOC | VM_UNINITIALIZED | - vm_flags, start, end, node, gfp_mask, caller); + area = __get_vm_area_node(real_size, align, shift, VM_ALLOC | + VM_UNINITIALIZED | vm_flags, start, end, node, + gfp_mask, caller); if (!area) { warn_alloc(gfp_mask, NULL, "vmalloc size %lu allocation failure: " @@ -2923,6 +2926,7 @@ again: */ clear_vm_uninitialized_flag(area); + size = PAGE_ALIGN(size); kmemleak_vmalloc(area, size, gfp_mask); return addr; @@ -2999,6 +3003,23 @@ void *vmalloc(unsigned long size) EXPORT_SYMBOL(vmalloc); /** + * vmalloc_no_huge - allocate virtually contiguous memory using small pages + * @size: allocation size + * + * Allocate enough non-huge pages to cover @size from the page level + * allocator and map them into contiguous kernel virtual space. + * + * Return: pointer to the allocated memory or %NULL on error + */ +void *vmalloc_no_huge(unsigned long size) +{ + return __vmalloc_node_range(size, 1, VMALLOC_START, VMALLOC_END, + GFP_KERNEL, PAGE_KERNEL, VM_NO_HUGE_VMAP, + NUMA_NO_NODE, __builtin_return_address(0)); +} +EXPORT_SYMBOL(vmalloc_no_huge); + +/** * vzalloc - allocate virtually contiguous memory with zero fill * @size: allocation size * |