diff options
author | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-10 17:53:04 -0700 |
---|---|---|
committer | Linus Torvalds <torvalds@linux-foundation.org> | 2022-10-10 17:53:04 -0700 |
commit | 27bc50fc90647bbf7b734c3fc306a5e61350da53 (patch) | |
tree | 75fc525fbfec8c07a97a7875a89592317bcad4ca /kernel/sched/fair.c | |
parent | 70442fc54e6889a2a77f0e9554e8188a1557f00e (diff) | |
parent | bbff39cc6cbcb86ccfacb2dcafc79912a9f9df69 (diff) | |
download | linux-27bc50fc90647bbf7b734c3fc306a5e61350da53.tar.gz |
Merge tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm
Pull MM updates from Andrew Morton: - Yu Zhao's Multi-Gen LRU patches are here. They've been under test in linux-next for a couple of months without, to my knowledge, any negative reports (or any positive ones, come to that). - Also the Maple Tree from Liam Howlett. An overlapping range-based tree for vmas. It it apparently slightly more efficient in its own right, but is mainly targeted at enabling work to reduce mmap_lock contention. Liam has identified a number of other tree users in the kernel which could be beneficially onverted to mapletrees. Yu Zhao has identified a hard-to-hit but "easy to fix" lockdep splat at [1]. This has yet to be addressed due to Liam's unfortunately timed vacation. He is now back and we'll get this fixed up. - Dmitry Vyukov introduces KMSAN: the Kernel Memory Sanitizer. It uses clang-generated instrumentation to detect used-unintialized bugs down to the single bit level. KMSAN keeps finding bugs. New ones, as well as the legacy ones. - Yang Shi adds a userspace mechanism (madvise) to induce a collapse of memory into THPs. - Zach O'Keefe has expanded Yang Shi's madvise(MADV_COLLAPSE) to support file/shmem-backed pages. - userfaultfd updates from Axel Rasmussen - zsmalloc cleanups from Alexey Romanov - cleanups from Miaohe Lin: vmscan, hugetlb_cgroup, hugetlb and memory-failure - Huang Ying adds enhancements to NUMA balancing memory tiering mode's page promotion, with a new way of detecting hot pages. - memcg updates from Shakeel Butt: charging optimizations and reduced memory consumption. - memcg cleanups from Kairui Song. - memcg fixes and cleanups from Johannes Weiner. - Vishal Moola provides more folio conversions - Zhang Yi removed ll_rw_block() :( - migration enhancements from Peter Xu - migration error-path bugfixes from Huang Ying - Aneesh Kumar added ability for a device driver to alter the memory tiering promotion paths. For optimizations by PMEM drivers, DRM drivers, etc. - vma merging improvements from Jakub Matěn. - NUMA hinting cleanups from David Hildenbrand. - xu xin added aditional userspace visibility into KSM merging activity. - THP & KSM code consolidation from Qi Zheng. - more folio work from Matthew Wilcox. - KASAN updates from Andrey Konovalov. - DAMON cleanups from Kaixu Xia. - DAMON work from SeongJae Park: fixes, cleanups. - hugetlb sysfs cleanups from Muchun Song. - Mike Kravetz fixes locking issues in hugetlbfs and in hugetlb core. Link: https://lkml.kernel.org/r/CAOUHufZabH85CeUN-MEMgL8gJGzJEWUrkiM58JkTbBhh-jew0Q@mail.gmail.com [1] * tag 'mm-stable-2022-10-08' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (555 commits) hugetlb: allocate vma lock for all sharable vmas hugetlb: take hugetlb vma_lock when clearing vma_lock->vma pointer hugetlb: fix vma lock handling during split vma and range unmapping mglru: mm/vmscan.c: fix imprecise comments mm/mglru: don't sync disk for each aging cycle mm: memcontrol: drop dead CONFIG_MEMCG_SWAP config symbol mm: memcontrol: use do_memsw_account() in a few more places mm: memcontrol: deprecate swapaccounting=0 mode mm: memcontrol: don't allocate cgroup swap arrays when memcg is disabled mm/secretmem: remove reduntant return value mm/hugetlb: add available_huge_pages() func mm: remove unused inline functions from include/linux/mm_inline.h selftests/vm: add selftest for MADV_COLLAPSE of uffd-minor memory selftests/vm: add file/shmem MADV_COLLAPSE selftest for cleared pmd selftests/vm: add thp collapse shmem testing selftests/vm: add thp collapse file and tmpfs testing selftests/vm: modularize thp collapse memory operations selftests/vm: dedup THP helpers mm/khugepaged: add tracepoint to hpage_collapse_scan_file() mm/madvise: add file and shmem support to MADV_COLLAPSE ...
Diffstat (limited to 'kernel/sched/fair.c')
-rw-r--r-- | kernel/sched/fair.c | 175 |
1 files changed, 172 insertions, 3 deletions
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 5ffec4370602..e4a0b8bd941c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -40,6 +40,7 @@ #include <linux/cpuidle.h> #include <linux/interrupt.h> +#include <linux/memory-tiers.h> #include <linux/mempolicy.h> #include <linux/mutex_api.h> #include <linux/profile.h> @@ -1090,6 +1091,12 @@ unsigned int sysctl_numa_balancing_scan_size = 256; /* Scan @scan_size MB every @scan_period after an initial @scan_delay in ms */ unsigned int sysctl_numa_balancing_scan_delay = 1000; +/* The page with hint page fault latency < threshold in ms is considered hot */ +unsigned int sysctl_numa_balancing_hot_threshold = MSEC_PER_SEC; + +/* Restrict the NUMA promotion throughput (MB/s) for each target node. */ +unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + struct numa_group { refcount_t refcount; @@ -1432,6 +1439,120 @@ static inline unsigned long group_weight(struct task_struct *p, int nid, return 1000 * faults / total_faults; } +/* + * If memory tiering mode is enabled, cpupid of slow memory page is + * used to record scan time instead of CPU and PID. When tiering mode + * is disabled at run time, the scan time (in cpupid) will be + * interpreted as CPU and PID. So CPU needs to be checked to avoid to + * access out of array bound. + */ +static inline bool cpupid_valid(int cpupid) +{ + return cpupid_to_cpu(cpupid) < nr_cpu_ids; +} + +/* + * For memory tiering mode, if there are enough free pages (more than + * enough watermark defined here) in fast memory node, to take full + * advantage of fast memory capacity, all recently accessed slow + * memory pages will be migrated to fast memory node without + * considering hot threshold. + */ +static bool pgdat_free_space_enough(struct pglist_data *pgdat) +{ + int z; + unsigned long enough_wmark; + + enough_wmark = max(1UL * 1024 * 1024 * 1024 >> PAGE_SHIFT, + pgdat->node_present_pages >> 4); + for (z = pgdat->nr_zones - 1; z >= 0; z--) { + struct zone *zone = pgdat->node_zones + z; + + if (!populated_zone(zone)) + continue; + + if (zone_watermark_ok(zone, 0, + wmark_pages(zone, WMARK_PROMO) + enough_wmark, + ZONE_MOVABLE, 0)) + return true; + } + return false; +} + +/* + * For memory tiering mode, when page tables are scanned, the scan + * time will be recorded in struct page in addition to make page + * PROT_NONE for slow memory page. So when the page is accessed, in + * hint page fault handler, the hint page fault latency is calculated + * via, + * + * hint page fault latency = hint page fault time - scan time + * + * The smaller the hint page fault latency, the higher the possibility + * for the page to be hot. + */ +static int numa_hint_fault_latency(struct page *page) +{ + int last_time, time; + + time = jiffies_to_msecs(jiffies); + last_time = xchg_page_access_time(page, time); + + return (time - last_time) & PAGE_ACCESS_TIME_MASK; +} + +/* + * For memory tiering mode, too high promotion/demotion throughput may + * hurt application latency. So we provide a mechanism to rate limit + * the number of pages that are tried to be promoted. + */ +static bool numa_promotion_rate_limit(struct pglist_data *pgdat, + unsigned long rate_limit, int nr) +{ + unsigned long nr_cand; + unsigned int now, start; + + now = jiffies_to_msecs(jiffies); + mod_node_page_state(pgdat, PGPROMOTE_CANDIDATE, nr); + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + start = pgdat->nbp_rl_start; + if (now - start > MSEC_PER_SEC && + cmpxchg(&pgdat->nbp_rl_start, start, now) == start) + pgdat->nbp_rl_nr_cand = nr_cand; + if (nr_cand - pgdat->nbp_rl_nr_cand >= rate_limit) + return true; + return false; +} + +#define NUMA_MIGRATION_ADJUST_STEPS 16 + +static void numa_promotion_adjust_threshold(struct pglist_data *pgdat, + unsigned long rate_limit, + unsigned int ref_th) +{ + unsigned int now, start, th_period, unit_th, th; + unsigned long nr_cand, ref_cand, diff_cand; + + now = jiffies_to_msecs(jiffies); + th_period = sysctl_numa_balancing_scan_period_max; + start = pgdat->nbp_th_start; + if (now - start > th_period && + cmpxchg(&pgdat->nbp_th_start, start, now) == start) { + ref_cand = rate_limit * + sysctl_numa_balancing_scan_period_max / MSEC_PER_SEC; + nr_cand = node_page_state(pgdat, PGPROMOTE_CANDIDATE); + diff_cand = nr_cand - pgdat->nbp_th_nr_cand; + unit_th = ref_th * 2 / NUMA_MIGRATION_ADJUST_STEPS; + th = pgdat->nbp_threshold ? : ref_th; + if (diff_cand > ref_cand * 11 / 10) + th = max(th - unit_th, unit_th); + else if (diff_cand < ref_cand * 9 / 10) + th = min(th + unit_th, ref_th * 2); + pgdat->nbp_th_nr_cand = nr_cand; + pgdat->nbp_threshold = th; + } +} + bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int src_nid, int dst_cpu) { @@ -1439,9 +1560,44 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page, int dst_nid = cpu_to_node(dst_cpu); int last_cpupid, this_cpupid; + /* + * The pages in slow memory node should be migrated according + * to hot/cold instead of private/shared. + */ + if (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING && + !node_is_toptier(src_nid)) { + struct pglist_data *pgdat; + unsigned long rate_limit; + unsigned int latency, th, def_th; + + pgdat = NODE_DATA(dst_nid); + if (pgdat_free_space_enough(pgdat)) { + /* workload changed, reset hot threshold */ + pgdat->nbp_threshold = 0; + return true; + } + + def_th = sysctl_numa_balancing_hot_threshold; + rate_limit = sysctl_numa_balancing_promote_rate_limit << \ + (20 - PAGE_SHIFT); + numa_promotion_adjust_threshold(pgdat, rate_limit, def_th); + + th = pgdat->nbp_threshold ? : def_th; + latency = numa_hint_fault_latency(page); + if (latency >= th) + return false; + + return !numa_promotion_rate_limit(pgdat, rate_limit, + thp_nr_pages(page)); + } + this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid); last_cpupid = page_cpupid_xchg_last(page, this_cpupid); + if (!(sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING) && + !node_is_toptier(src_nid) && !cpupid_valid(last_cpupid)) + return false; + /* * Allow first faults or private faults to migrate immediately early in * the lifetime of a task. The magic number 4 is based on waiting for @@ -2681,6 +2837,15 @@ void task_numa_fault(int last_cpupid, int mem_node, int pages, int flags) if (!p->mm) return; + /* + * NUMA faults statistics are unnecessary for the slow memory + * node for memory tiering mode. + */ + if (!node_is_toptier(mem_node) && + (sysctl_numa_balancing_mode & NUMA_BALANCING_MEMORY_TIERING || + !cpupid_valid(last_cpupid))) + return; + /* Allocate buffer to track faults on a per-node basis */ if (unlikely(!p->numa_faults)) { int size = sizeof(*p->numa_faults) * @@ -2761,6 +2926,7 @@ static void task_numa_work(struct callback_head *work) struct task_struct *p = current; struct mm_struct *mm = p->mm; u64 runtime = p->se.sum_exec_runtime; + MA_STATE(mas, &mm->mm_mt, 0, 0); struct vm_area_struct *vma; unsigned long start, end; unsigned long nr_pte_updates = 0; @@ -2817,13 +2983,16 @@ static void task_numa_work(struct callback_head *work) if (!mmap_read_trylock(mm)) return; - vma = find_vma(mm, start); + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); if (!vma) { reset_ptenuma_scan(p); start = 0; - vma = mm->mmap; + mas_set(&mas, start); + vma = mas_find(&mas, ULONG_MAX); } - for (; vma; vma = vma->vm_next) { + + for (; vma; vma = mas_find(&mas, ULONG_MAX)) { if (!vma_migratable(vma) || !vma_policy_mof(vma) || is_vm_hugetlb_page(vma) || (vma->vm_flags & VM_MIXEDMAP)) { continue; |